+// HTML-GLOBAL-INDEX: Global Namespace
+// HTML-GLOBAL-INDEX: Namespaces
+// HTML-GLOBAL-INDEX: @nonymous_namespace
+// HTML-GLOBAL-INDEX: PrimaryNamespace
+// HTML-GLOBAL-INDEX: AnotherNamespace
  • + +// MD-GLOBAL-INDEX: # Global Namespace +// MD-GLOBAL-INDEX: ## Namespaces +// MD-GLOBAL-INDEX: * [@nonymous_namespace](..{{[\/]}}@nonymous_namespace{{[\/]}}index.md) +// MD-GLOBAL-INDEX: * [PrimaryNamespace](..{{[\/]}}PrimaryNamespace{{[\/]}}index.md) +// MD-GLOBAL-INDEX: * [AnotherNamespace](..{{[\/]}}AnotherNamespace{{[\/]}}index.md) + +// MD-ALL-FILES: # All Files +// MD-ALL-FILES: ## [@nonymous_namespace](@nonymous_namespace{{[\/]}}index.md) +// MD-ALL-FILES: ## [AnotherNamespace](AnotherNamespace{{[\/]}}index.md) +// MD-ALL-FILES: ## [GlobalNamespace](GlobalNamespace{{[\/]}}index.md) +// MD-ALL-FILES: ## [PrimaryNamespace](PrimaryNamespace{{[\/]}}index.md) + +// MD-INDEX: # C/C++ Reference +// MD-INDEX: * Namespace: [@nonymous_namespace](@nonymous_namespace) +// MD-INDEX: * Namespace: [AnotherNamespace](AnotherNamespace) +// MD-INDEX: * Namespace: [PrimaryNamespace](PrimaryNamespace) \ No newline at end of file From eab37384c151c7eabbffb65e5a053b58f88c8b5d Mon Sep 17 00:00:00 2001 From: PeterChou1 Date: Tue, 16 Jul 2024 18:06:12 -0400 Subject: [PATCH 197/777] [clang-doc] add enum test (#97679) This patch adds a test which test the enum generation for clang-doc. --- clang-tools-extra/test/clang-doc/enum.cpp | 132 ++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 clang-tools-extra/test/clang-doc/enum.cpp diff --git a/clang-tools-extra/test/clang-doc/enum.cpp b/clang-tools-extra/test/clang-doc/enum.cpp new file mode 100644 index 0000000000000..e559940a31de6 --- /dev/null +++ b/clang-tools-extra/test/clang-doc/enum.cpp @@ -0,0 +1,132 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s +// RUN: clang-doc --format=md --doxygen --output=%t --executor=standalone %s +// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE +// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX +// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL-LINE +// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL +// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE +// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES +// RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX-LINE +// RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX +// RUN: FileCheck %s < %t/GlobalNamespace/Animals.md --check-prefix=MD-ANIMAL-LINE +// RUN: FileCheck %s < %t/GlobalNamespace/Animals.md --check-prefix=MD-ANIMAL +// RUN: FileCheck %s < %t/Vehicles/index.md --check-prefix=MD-VEHICLES-LINE +// RUN: FileCheck %s < %t/Vehicles/index.md --check-prefix=MD-VEHICLES + + +/** + * @brief For specifying RGB colors + */ +enum Color { +// MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]* +// HTML-INDEX-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  Red,   ///< Red
+  Green, ///< Green
+  Blue   ///< Blue
+};
+
+// MD-INDEX: ## Enums
+// MD-INDEX: | enum Color |
+// MD-INDEX: --
+// MD-INDEX: | Red |
+// MD-INDEX: | Green |
+// MD-INDEX: | Blue |
+// MD-INDEX: **brief** For specifying RGB colors
+
+// HTML-INDEX: Enums
+// HTML-INDEX: enum Color
+// HTML-INDEX: Red
+// HTML-INDEX: Green
+// HTML-INDEX: Blue
+
+/**
+ * @brief Shape Types
+ */
+enum class Shapes {
+// MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
+// HTML-INDEX-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  /// Circle
+  Circle,
+  /// Rectangle
+  Rectangle,
+  /// Triangle
+  Triangle
+};
+// MD-INDEX: | enum class Shapes |
+// MD-INDEX: --
+// MD-INDEX: | Circle |
+// MD-INDEX: | Rectangle |
+// MD-INDEX: | Triangle |
+// MD-INDEX: **brief** Shape Types
+
+// HTML-INDEX: enum class Shapes
+// HTML-INDEX: Circle
+// HTML-INDEX: Rectangle
+// HTML-INDEX: Triangle
+
+
+class Animals {
+// MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
+// HTML-ANIMAL-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+public:
+  /**
+   * @brief specify what animal the class is
+   */
+  enum AnimalType {
+// MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
+// HTML-ANIMAL-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+    Dog,    /// Man's best friend
+    Cat,    /// Man's other best friend
+    Iguana  /// A lizard
+  };
+};
+
+// HTML-ANIMAL: class Animals
+// HTML-ANIMAL: Enums
+// HTML-ANIMAL: enum AnimalType
+// HTML-ANIMAL: Dog
+// HTML-ANIMAL: Cat
+// HTML-ANIMAL: Iguana
+
+// MD-ANIMAL: # class Animals
+// MD-ANIMAL: ## Enums
+// MD-ANIMAL: | enum AnimalType |
+// MD-ANIMAL: --
+// MD-ANIMAL: | Dog |
+// MD-ANIMAL: | Cat |
+// MD-ANIMAL: | Iguana |
+// MD-ANIMAL: **brief** specify what animal the class is
+
+
+namespace Vehicles {
+  /**
+   * @brief specify type of car
+   */
+  enum Car {
+// MD-VEHICLES-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
+// HTML-VEHICLES-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+    Sedan,    /// Sedan
+    SUV,      /// SUV
+    Pickup,   /// Pickup
+    Hatchback /// Hatchback
+  };
+}
+
+// MD-VEHICLES: # namespace Vehicles
+// MD-VEHICLES: ## Enums
+// MD-VEHICLES: | enum Car |
+// MD-VEHICLES: --
+// MD-VEHICLES: | Sedan |
+// MD-VEHICLES: | SUV |
+// MD-VEHICLES: | Pickup |
+// MD-VEHICLES: | Hatchback |
+// MD-VEHICLES: **brief** specify type of car
+
+// HTML-VEHICLES: namespace Vehicles
+// HTML-VEHICLES: Enums
+// HTML-VEHICLES: enum Car
+// HTML-VEHICLES: Sedan
+// HTML-VEHICLES: SUV
+// HTML-VEHICLES: Pickup
+// HTML-VEHICLES: Hatchback
  • \ No newline at end of file From e8ab4132598cd925263137adfad510b411459907 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 16 Jul 2024 15:33:27 -0700 Subject: [PATCH 198/777] [RISCV] Add capital letters to T-Head extension names in descriptions. (#99070) This matches T-Head documentation and the capitalization we use for the RISCVSubtarget methods. --- .../Driver/print-supported-extensions-riscv.c | 22 +++++----- llvm/lib/Target/RISCV/RISCVFeatures.td | 44 +++++++++---------- llvm/test/MC/RISCV/XTHeadVdot-valid.s | 28 ++++++------ 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 88cbcc1296244..1dc4580ec202e 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -154,17 +154,17 @@ // CHECK-NEXT: xsfvqmaccqoq 1.0 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)) // CHECK-NEXT: xsifivecdiscarddlone 1.0 'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction) // CHECK-NEXT: xsifivecflushdlone 1.0 'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction) -// CHECK-NEXT: xtheadba 1.0 'xtheadba' (T-Head address calculation instructions) -// CHECK-NEXT: xtheadbb 1.0 'xtheadbb' (T-Head basic bit-manipulation instructions) -// CHECK-NEXT: xtheadbs 1.0 'xtheadbs' (T-Head single-bit instructions) -// CHECK-NEXT: xtheadcmo 1.0 'xtheadcmo' (T-Head cache management instructions) -// CHECK-NEXT: xtheadcondmov 1.0 'xtheadcondmov' (T-Head conditional move instructions) -// CHECK-NEXT: xtheadfmemidx 1.0 'xtheadfmemidx' (T-Head FP Indexed Memory Operations) -// CHECK-NEXT: xtheadmac 1.0 'xtheadmac' (T-Head Multiply-Accumulate Instructions) -// CHECK-NEXT: xtheadmemidx 1.0 'xtheadmemidx' (T-Head Indexed Memory Operations) -// CHECK-NEXT: xtheadmempair 1.0 'xtheadmempair' (T-Head two-GPR Memory Operations) -// CHECK-NEXT: xtheadsync 1.0 'xtheadsync' (T-Head multicore synchronization instructions) -// CHECK-NEXT: xtheadvdot 1.0 'xtheadvdot' (T-Head Vector Extensions for Dot) +// CHECK-NEXT: xtheadba 1.0 'XTHeadBa' (T-Head address calculation instructions) +// CHECK-NEXT: xtheadbb 1.0 'XTHeadBb' (T-Head basic bit-manipulation instructions) +// CHECK-NEXT: xtheadbs 1.0 'XTHeadBs' (T-Head single-bit instructions) +// CHECK-NEXT: xtheadcmo 1.0 'XTHeadCmo' (T-Head cache management instructions) +// CHECK-NEXT: xtheadcondmov 1.0 'XTHeadCondMov' (T-Head conditional move instructions) +// CHECK-NEXT: xtheadfmemidx 1.0 'XTHeadFMemIdx' (T-Head FP Indexed Memory Operations) +// CHECK-NEXT: xtheadmac 1.0 'XTHeadMac' (T-Head Multiply-Accumulate Instructions) +// CHECK-NEXT: xtheadmemidx 1.0 'XTHeadMemIdx' (T-Head Indexed Memory Operations) +// CHECK-NEXT: xtheadmempair 1.0 'XTHeadMemPair' (T-Head two-GPR Memory Operations) +// CHECK-NEXT: xtheadsync 1.0 'XTHeadSync' (T-Head multicore synchronization instructions) +// CHECK-NEXT: xtheadvdot 1.0 'XTHeadVdot' (T-Head Vector Extensions for Dot) // CHECK-NEXT: xventanacondops 1.0 'XVentanaCondOps' (Ventana Conditional Ops) // CHECK-NEXT: xwchc 2.2 'Xwchc' (WCH/QingKe additional compressed opcodes) // CHECK-EMPTY: diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 5a8605aa4a197..c9979b2b36fc3 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1006,81 +1006,81 @@ def HasVendorXVentanaCondOps : 
Predicate<"Subtarget->hasVendorXVentanaCondOps()" def FeatureVendorXTHeadBa : RISCVExtension<"xtheadba", 1, 0, - "'xtheadba' (T-Head address calculation instructions)">; + "'XTHeadBa' (T-Head address calculation instructions)">; def HasVendorXTHeadBa : Predicate<"Subtarget->hasVendorXTHeadBa()">, AssemblerPredicate<(all_of FeatureVendorXTHeadBa), - "'xtheadba' (T-Head address calculation instructions)">; + "'XTHeadBa' (T-Head address calculation instructions)">; def FeatureVendorXTHeadBb : RISCVExtension<"xtheadbb", 1, 0, - "'xtheadbb' (T-Head basic bit-manipulation instructions)">; + "'XTHeadBb' (T-Head basic bit-manipulation instructions)">; def HasVendorXTHeadBb : Predicate<"Subtarget->hasVendorXTHeadBb()">, AssemblerPredicate<(all_of FeatureVendorXTHeadBb), - "'xtheadbb' (T-Head basic bit-manipulation instructions)">; + "'XTHeadBb' (T-Head basic bit-manipulation instructions)">; def FeatureVendorXTHeadBs : RISCVExtension<"xtheadbs", 1, 0, - "'xtheadbs' (T-Head single-bit instructions)">; + "'XTHeadBs' (T-Head single-bit instructions)">; def HasVendorXTHeadBs : Predicate<"Subtarget->hasVendorXTHeadBs()">, AssemblerPredicate<(all_of FeatureVendorXTHeadBs), - "'xtheadbs' (T-Head single-bit instructions)">; + "'XTHeadBs' (T-Head single-bit instructions)">; def FeatureVendorXTHeadCondMov : RISCVExtension<"xtheadcondmov", 1, 0, - "'xtheadcondmov' (T-Head conditional move instructions)">; + "'XTHeadCondMov' (T-Head conditional move instructions)">; def HasVendorXTHeadCondMov : Predicate<"Subtarget->hasVendorXTHeadCondMov()">, AssemblerPredicate<(all_of FeatureVendorXTHeadCondMov), - "'xtheadcondmov' (T-Head conditional move instructions)">; + "'XTHeadCondMov' (T-Head conditional move instructions)">; def FeatureVendorXTHeadCmo : RISCVExtension<"xtheadcmo", 1, 0, - "'xtheadcmo' (T-Head cache management instructions)">; + "'XTHeadCmo' (T-Head cache management instructions)">; def HasVendorXTHeadCmo : Predicate<"Subtarget->hasVendorXTHeadCmo()">, AssemblerPredicate<(all_of FeatureVendorXTHeadCmo), - "'xtheadcmo' (T-Head cache management instructions)">; + "'XTHeadCmo' (T-Head cache management instructions)">; def FeatureVendorXTHeadFMemIdx : RISCVExtension<"xtheadfmemidx", 1, 0, - "'xtheadfmemidx' (T-Head FP Indexed Memory Operations)">; + "'XTHeadFMemIdx' (T-Head FP Indexed Memory Operations)">; def HasVendorXTHeadFMemIdx : Predicate<"Subtarget->hasVendorXTHeadFMemIdx()">, AssemblerPredicate<(all_of FeatureVendorXTHeadFMemIdx), - "'xtheadfmemidx' (T-Head FP Indexed Memory Operations)">; + "'XTHeadFMemIdx' (T-Head FP Indexed Memory Operations)">; def FeatureVendorXTHeadMac : RISCVExtension<"xtheadmac", 1, 0, - "'xtheadmac' (T-Head Multiply-Accumulate Instructions)">; + "'XTHeadMac' (T-Head Multiply-Accumulate Instructions)">; def HasVendorXTHeadMac : Predicate<"Subtarget->hasVendorXTHeadMac()">, AssemblerPredicate<(all_of FeatureVendorXTHeadMac), - "'xtheadmac' (T-Head Multiply-Accumulate Instructions)">; + "'XTHeadMac' (T-Head Multiply-Accumulate Instructions)">; def FeatureVendorXTHeadMemIdx : RISCVExtension<"xtheadmemidx", 1, 0, - "'xtheadmemidx' (T-Head Indexed Memory Operations)">; + "'XTHeadMemIdx' (T-Head Indexed Memory Operations)">; def HasVendorXTHeadMemIdx : Predicate<"Subtarget->hasVendorXTHeadMemIdx()">, AssemblerPredicate<(all_of FeatureVendorXTHeadMemIdx), - "'xtheadmemidx' (T-Head Indexed Memory Operations)">; + "'XTHeadMemIdx' (T-Head Indexed Memory Operations)">; def FeatureVendorXTHeadMemPair : RISCVExtension<"xtheadmempair", 1, 0, - "'xtheadmempair' (T-Head two-GPR Memory 
Operations)">; + "'XTHeadMemPair' (T-Head two-GPR Memory Operations)">; def HasVendorXTHeadMemPair : Predicate<"Subtarget->hasVendorXTHeadMemPair()">, AssemblerPredicate<(all_of FeatureVendorXTHeadMemPair), - "'xtheadmempair' (T-Head two-GPR Memory Operations)">; + "'XTHeadMemPair' (T-Head two-GPR Memory Operations)">; def FeatureVendorXTHeadSync : RISCVExtension<"xtheadsync", 1, 0, - "'xtheadsync' (T-Head multicore synchronization instructions)">; + "'XTHeadSync' (T-Head multicore synchronization instructions)">; def HasVendorXTHeadSync : Predicate<"Subtarget->hasVendorXTHeadSync()">, AssemblerPredicate<(all_of FeatureVendorXTHeadSync), - "'xtheadsync' (T-Head multicore synchronization instructions)">; + "'XTHeadSync' (T-Head multicore synchronization instructions)">; def FeatureVendorXTHeadVdot : RISCVExtension<"xtheadvdot", 1, 0, - "'xtheadvdot' (T-Head Vector Extensions for Dot)", + "'XTHeadVdot' (T-Head Vector Extensions for Dot)", [FeatureStdExtV]>; def HasVendorXTHeadVdot : Predicate<"Subtarget->hasVendorXTHeadVdot()">, AssemblerPredicate<(all_of FeatureVendorXTHeadVdot), - "'xtheadvdot' (T-Head Vector Extensions for Dot)">; + "'XTHeadVdot' (T-Head Vector Extensions for Dot)">; // SiFive Extensions diff --git a/llvm/test/MC/RISCV/XTHeadVdot-valid.s b/llvm/test/MC/RISCV/XTHeadVdot-valid.s index ab411dfac7308..ecae431f68a90 100644 --- a/llvm/test/MC/RISCV/XTHeadVdot-valid.s +++ b/llvm/test/MC/RISCV/XTHeadVdot-valid.s @@ -11,83 +11,83 @@ th.vmaqau.vv v8, v20, v4, v0.t # CHECK-INST: th.vmaqau.vv v8, v20, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x4a,0x88] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 884a640b th.vmaqau.vv v8, v20, v4 # CHECK-INST: th.vmaqau.vv v8, v20, v4 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x8a] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 8a4a640b th.vmaqau.vx v8, a0, v4, v0.t # CHECK-INST: th.vmaqau.vx v8, a0, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x45,0x8c] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 8c45640b th.vmaqau.vx v8, a0, v4 # CHECK-INST: th.vmaqau.vx v8, a0, v4 # CHECK-ENCODING: [0x0b,0x64,0x45,0x8e] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 8e45640b th.vmaqa.vv v8, v20, v4, v0.t # CHECK-INST: th.vmaqa.vv v8, v20, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x4a,0x80] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 804a640b th.vmaqa.vv v8, v20, v4 # CHECK-INST: th.vmaqa.vv v8, v20, v4 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x82] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 824a640b th.vmaqa.vx v8, a0, 
v4, v0.t # CHECK-INST: th.vmaqa.vx v8, a0, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x45,0x84] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 8445640b th.vmaqa.vx v8, a0, v4 # CHECK-INST: th.vmaqa.vx v8, a0, v4 # CHECK-ENCODING: [0x0b,0x64,0x45,0x86] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 8645640b th.vmaqasu.vv v8, v20, v4, v0.t # CHECK-INST: th.vmaqasu.vv v8, v20, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x4a,0x90] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 904a640b th.vmaqasu.vv v8, v20, v4 # CHECK-INST: th.vmaqasu.vv v8, v20, v4 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x92] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 924a640b th.vmaqasu.vx v8, a0, v4, v0.t # CHECK-INST: th.vmaqasu.vx v8, a0, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x45,0x94] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 9445640b th.vmaqasu.vx v8, a0, v4 # CHECK-INST: th.vmaqasu.vx v8, a0, v4 # CHECK-ENCODING: [0x0b,0x64,0x45,0x96] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 9645640b th.vmaqaus.vx v8, a0, v4, v0.t # CHECK-INST: th.vmaqaus.vx v8, a0, v4, v0.t # CHECK-ENCODING: [0x0b,0x64,0x45,0x9c] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 9c45640b th.vmaqaus.vx v8, a0, v4 # CHECK-INST: th.vmaqaus.vx v8, a0, v4 # CHECK-ENCODING: [0x0b,0x64,0x45,0x9e] -# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}} +# CHECK-ERROR: instruction requires the following: 'XTHeadVdot' (T-Head Vector Extensions for Dot){{$}} # CHECK-UNKNOWN: 9e45640b From dbc3df17180782006be0ad6e43d3da81d98c2d4d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 16 Jul 2024 15:39:20 -0700 Subject: [PATCH 199/777] [RISCV] Remove unnecessary call to MachineFunction::getSubtarget. NFC RISCVInstrInfo already caches a reference to the subtarget object that owns it. We can use that. 
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 5e1b5284751f4..ba3b4bd701d63 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1474,10 +1474,8 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (!MI.memoperands_empty()) { MachineMemOperand *MMO = *(MI.memoperands_begin()); - const MachineFunction &MF = *MI.getParent()->getParent(); - const auto &ST = MF.getSubtarget(); - if (ST.hasStdExtZihintntl() && MMO->isNonTemporal()) { - if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs()) { + if (STI.hasStdExtZihintntl() && MMO->isNonTemporal()) { + if (STI.hasStdExtCOrZca() && STI.enableRVCHintInstrs()) { if (isCompressibleInst(MI, STI)) return 4; // c.ntl.all + c.load/c.store return 6; // c.ntl.all + load/store From 6ad2987a72392e9885e1186a34834041445e0a1e Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Wed, 17 Jul 2024 00:41:36 +0200 Subject: [PATCH 200/777] [lld-macho] Omit `__llvm_addrsig` metadata from the output (#98913) This section contains metadata that's only relevant for Identical Code Folding at link time, we should not include it in the output. We still treat it like a regular section during input file parsing (e.g. create a `ConcatInputSection` for it), as we want its relocations to be parsed. But it should not be passed to `addInputSection`, as that's what assigns it to an `OutputSection` and adds it to the `inputSections` vector which specifies the inputs to dead-stripping and relocation scanning. This fixes a "__DATA,__llvm_addrsig, offset 0: fixups overlap" error when using `--icf=safe` alongside `-fixup_chains`. This occurs because all `__llvm_addrsig` sections are 8 bytes large, and the relocations which signify functions whose addresses are taken are all at offset 0. This makes the fix in 5fa24ac2 ("Category Merger: add support for addrsig references") obsolete, as we no longer try to resolve symbols referenced in `__llvm_addrsig` when writing the output file. When we do iterate its relocations in `markAddrSigSymbols`, we do not try to resolve their addresses. --- lld/MachO/Driver.cpp | 3 +++ lld/MachO/ObjC.cpp | 32 -------------------------------- lld/test/MachO/dead-strip.s | 24 ++++++++++++++++++++++++ lld/test/MachO/icf-safe.ll | 16 ++++++++++++---- 4 files changed, 39 insertions(+), 36 deletions(-) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 28c28f29defd1..a370d5734124a 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1247,6 +1247,9 @@ static void gatherInputSections() { // contrast, EH frames are handled like regular ConcatInputSections.) if (section->name == section_names::compactUnwind) continue; + // Addrsig sections contain metadata only needed at link time. 
+ if (section->name == section_names::addrSig) + continue; for (const Subsection &subsection : section->subsections) addInputSection(subsection.isec); } diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index 740ebaf7e0403..4a6f99654ba13 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -449,7 +449,6 @@ class ObjcCategoryMerger { mergeCategoriesIntoSingleCategory(std::vector &categories); void eraseISec(ConcatInputSection *isec); - void removeRefsToErasedIsecs(); void eraseMergedCategories(); void generateCatListForNonErasedCategories( @@ -519,8 +518,6 @@ class ObjcCategoryMerger { std::vector &allInputSections; // Map of base class Symbol to list of InfoInputCategory's for it MapVector> categoryMap; - // Set for tracking InputSection erased via eraseISec - DenseSet erasedIsecs; // Normally, the binary data comes from the input files, but since we're // generating binary data ourselves, we use the below array to store it in. @@ -1272,8 +1269,6 @@ void ObjcCategoryMerger::generateCatListForNonErasedCategories( } void ObjcCategoryMerger::eraseISec(ConcatInputSection *isec) { - erasedIsecs.insert(isec); - isec->live = false; for (auto &sym : isec->symbols) sym->used = false; @@ -1326,33 +1321,6 @@ void ObjcCategoryMerger::eraseMergedCategories() { catLayout.instancePropsOffset); } } - - removeRefsToErasedIsecs(); -} - -// The compiler may generate references to categories inside the addrsig -// section. This function will erase these references. -void ObjcCategoryMerger::removeRefsToErasedIsecs() { - for (InputSection *isec : inputSections) { - if (isec->getName() != section_names::addrSig) - continue; - - auto removeRelocs = [this](Reloc &r) { - auto *isec = dyn_cast_or_null( - r.referent.dyn_cast()); - if (!isec) { - Defined *sym = - dyn_cast_or_null(r.referent.dyn_cast()); - if (sym) - isec = dyn_cast(sym->isec()); - } - if (!isec) - return false; - return erasedIsecs.count(isec) > 0; - }; - - llvm::erase_if(isec->relocs, removeRelocs); - } } void ObjcCategoryMerger::doMerge() { diff --git a/lld/test/MachO/dead-strip.s b/lld/test/MachO/dead-strip.s index f593b69843ba6..d107dad53a3c5 100644 --- a/lld/test/MachO/dead-strip.s +++ b/lld/test/MachO/dead-strip.s @@ -329,6 +329,17 @@ # LIT-NEXT: Contents of (__TEXT,__literals) section # LIT-NEXT: ef be ad de {{$}} +## Ensure that addrsig metadata does not keep unreferenced functions alive. 
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos \ +# RUN: %t/addrsig.s -o %t/addrsig.o +# RUN: %lld -lSystem -dead_strip --icf=safe %t/addrsig.o -o %t/addrsig +# RUN: llvm-objdump --syms %t/addrsig | \ +# RUN: FileCheck --check-prefix=ADDSIG --implicit-check-not _addrsig %s +# ADDSIG-LABEL: SYMBOL TABLE: +# ADDSIG-NEXT: g F __TEXT,__text _main +# ADDSIG-NEXT: g F __TEXT,__text __mh_execute_header +# ADDSIG-NEXT: *UND* dyld_stub_binder + ## Duplicate symbols that will be dead stripped later should not fail when using ## the --dead-stripped-duplicates flag # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos \ @@ -988,3 +999,16 @@ _more_data: _main: callq _ref_undef_fun .subsections_via_symbols + +#--- addrsig.s +.globl _main, _addrsig +_main: + retq + +_addrsig: + retq + +.subsections_via_symbols + +.addrsig +.addrsig_sym _addrsig diff --git a/lld/test/MachO/icf-safe.ll b/lld/test/MachO/icf-safe.ll index 71c6f9f7ddac8..03830a30048a6 100644 --- a/lld/test/MachO/icf-safe.ll +++ b/lld/test/MachO/icf-safe.ll @@ -5,14 +5,19 @@ ; RUN: llc -filetype=obj %s -O3 -o %t/icf-obj.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig ; RUN: %lld -arch arm64 -lSystem --icf=safe -dylib -o %t/icf-safe.dylib %t/icf-obj.o ; RUN: %lld -arch arm64 -lSystem --icf=all -dylib -o %t/icf-all.dylib %t/icf-obj.o -; RUN: llvm-objdump %t/icf-safe.dylib -d --macho | FileCheck %s --check-prefix=ICFSAFE -; RUN: llvm-objdump %t/icf-all.dylib -d --macho | FileCheck %s --check-prefix=ICFALL +; RUN: llvm-objdump %t/icf-safe.dylib -d -h --macho | FileCheck %s --check-prefixes=ICFSAFE,CHECK +; RUN: llvm-objdump %t/icf-all.dylib -d -h --macho | FileCheck %s --check-prefixes=ICFALL,CHECK ; RUN: llvm-as %s -o %t/icf-bitcode.o ; RUN: %lld -arch arm64 -lSystem --icf=safe -dylib -o %t/icf-safe-bitcode.dylib %t/icf-bitcode.o ; RUN: %lld -arch arm64 -lSystem --icf=all -dylib -o %t/icf-all-bitcode.dylib %t/icf-bitcode.o -; RUN: llvm-objdump %t/icf-safe-bitcode.dylib -d --macho | FileCheck %s --check-prefix=ICFSAFE -; RUN: llvm-objdump %t/icf-all-bitcode.dylib -d --macho | FileCheck %s --check-prefix=ICFALL +; RUN: llvm-objdump %t/icf-safe-bitcode.dylib -d -h --macho | FileCheck %s --check-prefixes=ICFSAFE,CHECK +; RUN: llvm-objdump %t/icf-all-bitcode.dylib -d -h --macho | FileCheck %s --check-prefixes=ICFALL,CHECK + +;; Regression test: if we tried writing __llvm_addrsig to the output, -fixup_chains would fail with a "fixups overlap" +;; error, as the relocations (which reference the address-taken functions) are all at offset 0. +; RUN: %lld -arch arm64 -lSystem --icf=safe -fixup_chains -dylib -o %t/icf-safe-chained.dylib %t/icf-obj.o +; RUN: llvm-objdump %t/icf-safe-chained.dylib -d -h --macho | FileCheck %s --check-prefixes=ICFSAFE,CHECK ; ICFSAFE-LABEL: _callAllFunctions ; ICFSAFE: bl _func02 @@ -24,6 +29,9 @@ ; ICFALL-NEXT: bl _func03_takeaddr ; ICFALL-NEXT: bl _func03_takeaddr +; CHECK-LABEL: Sections: +; CHECK-NOT: __llvm_addrsig + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-macos11.0" From ffbda47159361cd089b8046ec634ace071e5493c Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 17 Jul 2024 07:13:16 +0800 Subject: [PATCH 201/777] [mlir] Fix build error (NFC) /llvm-project/mlir/include/mlir/CAPI/Rewrite.h:21:63: error: extra ';' outside of a function is incompatible with C++98 [-Werror,-Wc++98-compat-extra-semi] DEFINE_C_API_PTR_METHODS(MlirRewriterBase, mlir::RewriterBase); ^ 1 error generated. 
--- mlir/include/mlir/CAPI/Rewrite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/CAPI/Rewrite.h b/mlir/include/mlir/CAPI/Rewrite.h index 0e6dcb2477626..f0bb9337e49ea 100644 --- a/mlir/include/mlir/CAPI/Rewrite.h +++ b/mlir/include/mlir/CAPI/Rewrite.h @@ -18,6 +18,6 @@ #include "mlir/CAPI/Wrap.h" #include "mlir/IR/PatternMatch.h" -DEFINE_C_API_PTR_METHODS(MlirRewriterBase, mlir::RewriterBase); +DEFINE_C_API_PTR_METHODS(MlirRewriterBase, mlir::RewriterBase) #endif // MLIR_CAPIREWRITER_H From bed625b0ad6722e1d29a3ea492dda173eee541dd Mon Sep 17 00:00:00 2001 From: Dmitriy Chestnykh Date: Wed, 17 Jul 2024 02:34:25 +0300 Subject: [PATCH 202/777] [MC,ELF] Emit warning if a string constant contains newline char (#98060) GAS emits warning about newline in the string constant so make the same behaviour. --- llvm/lib/MC/MCParser/AsmLexer.cpp | 1 - llvm/lib/MC/MCParser/AsmParser.cpp | 5 ++ .../MC/ELF/warn-newline-in-escaped-string.s | 55 +++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/ELF/warn-newline-in-escaped-string.s diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index e08404ae0ad92..778ca340e1248 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -646,7 +646,6 @@ AsmToken AsmLexer::LexQuote() { return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); } - // TODO: does gas allow multiline string constants? while (CurChar != '"') { if (CurChar == '\\') { // Allow \", etc. diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index ee5bebf324570..d05712bca73cd 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3033,6 +3033,11 @@ bool AsmParser::parseEscapedString(std::string &Data) { StringRef Str = getTok().getStringContents(); for (unsigned i = 0, e = Str.size(); i != e; ++i) { if (Str[i] != '\\') { + if (Str[i] == '\n') { + SMLoc NewlineLoc = SMLoc::getFromPointer(Str.data() + i); + if (Warning(NewlineLoc, "unterminated string; newline inserted")) + return true; + } Data += Str[i]; continue; } diff --git a/llvm/test/MC/ELF/warn-newline-in-escaped-string.s b/llvm/test/MC/ELF/warn-newline-in-escaped-string.s new file mode 100644 index 0000000000000..64de13969ffb0 --- /dev/null +++ b/llvm/test/MC/ELF/warn-newline-in-escaped-string.s @@ -0,0 +1,55 @@ +// RUN: llvm-mc -filetype=obj -triple x86_64 %s 2>&1 -o /dev/null \ +// RUN: | FileCheck %s --implicit-check-not=warning: + +.string "abcd\xFFefg +12345678" + +// CHECK: [[#@LINE-3]]:21: warning: unterminated string; newline inserted +// CHECK-NEXT: .string "abcd\xFFefg + +.ascii "some test ascii + +sequence +with +newlines\x0A +" + +// CHECK: [[#@LINE-7]]:24: warning: unterminated string; newline inserted +// CHECK-NEXT: .ascii "some test ascii +// CHECK: [[#@LINE-8]]:1: warning: unterminated string; newline inserted +// CHECK: [[#@LINE-8]]:9: warning: unterminated string; newline inserted +// CHECK-NEXT: sequence +// CHECK: [[#@LINE-9]]:5: warning: unterminated string; newline inserted +// CHECK-NEXT: with +// CHECK: [[#@LINE-10]]:13: warning: unterminated string; newline inserted +// CHECK-NEXT: newlines\x0A + +.asciz "another test string + +with +newline characters + + +" + +// CHECK: [[#@LINE-8]]:28: warning: unterminated string; newline inserted +// CHECK-NEXT: .asciz "another test string +// CHECK: [[#@LINE-9]]:1: warning: unterminated string; newline inserted +// CHECK: [[#@LINE-9]]:5: 
warning: unterminated string; newline inserted +// CHECK-NEXT: with +// CHECK: [[#@LINE-10]]:19: warning: unterminated string; newline inserted +// CHECK-NEXT: newline characters +// CHECK: [[#@LINE-11]]:1: warning: unterminated string; newline inserted +// CHECK: [[#@LINE-11]]:1: warning: unterminated string; newline inserted + +.file "warn-newline +.s" +// CHECK: [[#@LINE-2]]:20: warning: unterminated string; newline inserted + +.cv_file 1 "some_an +other_file.s" +// CHECK: [[#@LINE-2]]:20: warning: unterminated string; newline inserted + +.ascii "test\nvalid1_string\xFF\n\n\xFF" +.asciz "\n\n\nvalid2_string\x0A" +.string "1234\nvalid3_string\xFF\n\xFF\n" From f56cdd4a45b7bbe84be5d4ba9442eb7071605efc Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 16 Jul 2024 19:46:46 -0400 Subject: [PATCH 203/777] [NFC] Use named variable for test case `select-from-load.ll` --- llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll b/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll index d9af665e663f3..e22fd040bdc2c 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll @@ -25,8 +25,8 @@ entry: br label %for.cond10 for.cond10: ; preds = %for.cond10, %entry - %3 = load i64, ptr %retval.0.i - store i64 %3, ptr addrspace(1) null + %load.0 = load i64, ptr %retval.0.i + store i64 %load.0, ptr addrspace(1) null br label %for.cond10 } From b6c4ad700b0f5851313f18df89b9da2c27ba3185 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Wed, 17 Jul 2024 08:37:55 +0800 Subject: [PATCH 204/777] [RISCV] Remove x7 from fastcc list. (#96729) Like #93321, this patch also tries to solve the conflict usage of x7 for fastcc and Zicfilp. But this patch removes x7 from fastcc directly. Its purpose is to reduce the code complexity of #93321, and we also found that it at most increase 0.02% instruction count for most benchmarks and it might be benefit for benchmarks. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- llvm/test/CodeGen/RISCV/fastcc-int.ll | 34 +- .../CodeGen/RISCV/fastcc-without-f-reg.ll | 1196 +++++++++-------- .../CodeGen/RISCV/rvv/calling-conv-fastcc.ll | 68 +- .../rvv/fixed-vectors-calling-conv-fastcc.ll | 25 +- 5 files changed, 680 insertions(+), 652 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8b5e56bff4097..1280201d7b814 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -18884,15 +18884,14 @@ ArrayRef RISCV::getArgGPRs(const RISCVABI::ABI ABI) { static ArrayRef getFastCCArgGPRs(const RISCVABI::ABI ABI) { // The GPRs used for passing arguments in the FastCC, X5 and X6 might be used // for save-restore libcall, so we don't use them. + // Don't use X7 for fastcc, since Zicfilp uses X7 as the label register. static const MCPhysReg FastCCIGPRs[] = { - RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, - RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28, - RISCV::X29, RISCV::X30, RISCV::X31}; + RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, + RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31}; // The GPRs used for passing arguments in the FastCC when using ILP32E/ILP64E. 
static const MCPhysReg FastCCEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X7}; + RISCV::X13, RISCV::X14, RISCV::X15}; if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) return ArrayRef(FastCCEGPRs); diff --git a/llvm/test/CodeGen/RISCV/fastcc-int.ll b/llvm/test/CodeGen/RISCV/fastcc-int.ll index e4c41a1aa890f..75046b701b235 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-int.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-int.ll @@ -32,16 +32,17 @@ define i32 @caller(<16 x i32> %A) nounwind { ; RV32-NEXT: lw a5, 20(a0) ; RV32-NEXT: lw a6, 24(a0) ; RV32-NEXT: lw a7, 28(a0) -; RV32-NEXT: lw t2, 32(a0) -; RV32-NEXT: lw t3, 36(a0) -; RV32-NEXT: lw t4, 40(a0) -; RV32-NEXT: lw t5, 44(a0) -; RV32-NEXT: lw t6, 48(a0) -; RV32-NEXT: lw t1, 52(a0) +; RV32-NEXT: lw t3, 32(a0) +; RV32-NEXT: lw t4, 36(a0) +; RV32-NEXT: lw t5, 40(a0) +; RV32-NEXT: lw t6, 44(a0) +; RV32-NEXT: lw t1, 48(a0) +; RV32-NEXT: lw t2, 52(a0) ; RV32-NEXT: lw s0, 56(a0) ; RV32-NEXT: lw a0, 60(a0) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw s0, 4(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw s0, 8(sp) +; RV32-NEXT: sw t2, 4(sp) ; RV32-NEXT: sw t1, 0(sp) ; RV32-NEXT: mv a0, t0 ; RV32-NEXT: call callee @@ -63,16 +64,17 @@ define i32 @caller(<16 x i32> %A) nounwind { ; RV64-NEXT: ld a5, 40(a0) ; RV64-NEXT: ld a6, 48(a0) ; RV64-NEXT: ld a7, 56(a0) -; RV64-NEXT: ld t2, 64(a0) -; RV64-NEXT: ld t3, 72(a0) -; RV64-NEXT: ld t4, 80(a0) -; RV64-NEXT: ld t5, 88(a0) -; RV64-NEXT: ld t6, 96(a0) -; RV64-NEXT: ld t1, 104(a0) +; RV64-NEXT: ld t3, 64(a0) +; RV64-NEXT: ld t4, 72(a0) +; RV64-NEXT: ld t5, 80(a0) +; RV64-NEXT: ld t6, 88(a0) +; RV64-NEXT: ld t1, 96(a0) +; RV64-NEXT: ld t2, 104(a0) ; RV64-NEXT: ld s0, 112(a0) ; RV64-NEXT: ld a0, 120(a0) -; RV64-NEXT: sd a0, 16(sp) -; RV64-NEXT: sd s0, 8(sp) +; RV64-NEXT: sd a0, 24(sp) +; RV64-NEXT: sd s0, 16(sp) +; RV64-NEXT: sd t2, 8(sp) ; RV64-NEXT: sd t1, 0(sp) ; RV64-NEXT: mv a0, t0 ; RV64-NEXT: call callee diff --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll index a44d31dff09cc..1dbb060fc35fa 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll @@ -288,29 +288,30 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: lh t2, 196(sp) ; ZHINX32-NEXT: lh t1, 200(sp) ; ZHINX32-NEXT: lh t0, 204(sp) -; ZHINX32-NEXT: sh t0, 36(sp) -; ZHINX32-NEXT: sh t1, 34(sp) -; ZHINX32-NEXT: sh t2, 32(sp) -; ZHINX32-NEXT: sh t3, 30(sp) -; ZHINX32-NEXT: sh ra, 28(sp) -; ZHINX32-NEXT: sh s11, 26(sp) -; ZHINX32-NEXT: sh s10, 24(sp) -; ZHINX32-NEXT: sh s9, 22(sp) -; ZHINX32-NEXT: sh s8, 20(sp) -; ZHINX32-NEXT: sh s7, 18(sp) -; ZHINX32-NEXT: sh s6, 16(sp) -; ZHINX32-NEXT: sh s5, 14(sp) -; ZHINX32-NEXT: sh s4, 12(sp) -; ZHINX32-NEXT: sh s3, 10(sp) -; ZHINX32-NEXT: sh s2, 8(sp) -; ZHINX32-NEXT: sh s1, 6(sp) -; ZHINX32-NEXT: sh s0, 4(sp) -; ZHINX32-NEXT: sh t4, 2(sp) -; ZHINX32-NEXT: sh t5, 0(sp) -; ZHINX32-NEXT: lw t2, 56(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t3, 52(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t4, 48(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t5, 44(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: sh t0, 38(sp) +; ZHINX32-NEXT: sh t1, 36(sp) +; ZHINX32-NEXT: sh t2, 34(sp) +; ZHINX32-NEXT: sh t3, 32(sp) +; ZHINX32-NEXT: sh ra, 30(sp) +; ZHINX32-NEXT: sh s11, 28(sp) +; ZHINX32-NEXT: sh s10, 26(sp) +; ZHINX32-NEXT: sh s9, 24(sp) +; ZHINX32-NEXT: sh s8, 22(sp) +; ZHINX32-NEXT: sh s7, 20(sp) +; ZHINX32-NEXT: sh s6, 18(sp) 
+; ZHINX32-NEXT: sh s5, 16(sp) +; ZHINX32-NEXT: sh s4, 14(sp) +; ZHINX32-NEXT: sh s3, 12(sp) +; ZHINX32-NEXT: sh s2, 10(sp) +; ZHINX32-NEXT: sh s1, 8(sp) +; ZHINX32-NEXT: sh s0, 6(sp) +; ZHINX32-NEXT: sh t4, 4(sp) +; ZHINX32-NEXT: sh t5, 2(sp) +; ZHINX32-NEXT: sh t6, 0(sp) +; ZHINX32-NEXT: lw t3, 56(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t4, 52(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t5, 48(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t6, 44(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: call callee_half_32 ; ZHINX32-NEXT: lw ra, 108(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: lw s0, 104(sp) # 4-byte Folded Reload @@ -372,29 +373,30 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: lh t2, 344(sp) ; ZHINX64-NEXT: lh t1, 352(sp) ; ZHINX64-NEXT: lh t0, 360(sp) -; ZHINX64-NEXT: sh t0, 36(sp) -; ZHINX64-NEXT: sh t1, 34(sp) -; ZHINX64-NEXT: sh t2, 32(sp) -; ZHINX64-NEXT: sh t3, 30(sp) -; ZHINX64-NEXT: sh ra, 28(sp) -; ZHINX64-NEXT: sh s11, 26(sp) -; ZHINX64-NEXT: sh s10, 24(sp) -; ZHINX64-NEXT: sh s9, 22(sp) -; ZHINX64-NEXT: sh s8, 20(sp) -; ZHINX64-NEXT: sh s7, 18(sp) -; ZHINX64-NEXT: sh s6, 16(sp) -; ZHINX64-NEXT: sh s5, 14(sp) -; ZHINX64-NEXT: sh s4, 12(sp) -; ZHINX64-NEXT: sh s3, 10(sp) -; ZHINX64-NEXT: sh s2, 8(sp) -; ZHINX64-NEXT: sh s1, 6(sp) -; ZHINX64-NEXT: sh s0, 4(sp) -; ZHINX64-NEXT: sh t4, 2(sp) -; ZHINX64-NEXT: sh t5, 0(sp) -; ZHINX64-NEXT: ld t2, 64(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t3, 56(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t4, 48(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t5, 40(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: sh t0, 38(sp) +; ZHINX64-NEXT: sh t1, 36(sp) +; ZHINX64-NEXT: sh t2, 34(sp) +; ZHINX64-NEXT: sh t3, 32(sp) +; ZHINX64-NEXT: sh ra, 30(sp) +; ZHINX64-NEXT: sh s11, 28(sp) +; ZHINX64-NEXT: sh s10, 26(sp) +; ZHINX64-NEXT: sh s9, 24(sp) +; ZHINX64-NEXT: sh s8, 22(sp) +; ZHINX64-NEXT: sh s7, 20(sp) +; ZHINX64-NEXT: sh s6, 18(sp) +; ZHINX64-NEXT: sh s5, 16(sp) +; ZHINX64-NEXT: sh s4, 14(sp) +; ZHINX64-NEXT: sh s3, 12(sp) +; ZHINX64-NEXT: sh s2, 10(sp) +; ZHINX64-NEXT: sh s1, 8(sp) +; ZHINX64-NEXT: sh s0, 6(sp) +; ZHINX64-NEXT: sh t4, 4(sp) +; ZHINX64-NEXT: sh t5, 2(sp) +; ZHINX64-NEXT: sh t6, 0(sp) +; ZHINX64-NEXT: ld t3, 64(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t4, 56(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t5, 48(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t6, 40(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: call callee_half_32 ; ZHINX64-NEXT: ld ra, 168(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: ld s0, 160(sp) # 8-byte Folded Reload @@ -414,38 +416,38 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ; ZFINX32-LABEL: caller_half_32: ; ZFINX32: # %bb.0: -; ZFINX32-NEXT: addi sp, sp, -144 -; ZFINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: addi sp, sp, -160 +; ZFINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; 
ZFINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: lw t0, 0(a0) ; ZFINX32-NEXT: lw a1, 4(a0) -; ZFINX32-NEXT: sw a1, 88(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a1, 104(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: lw a1, 8(a0) -; ZFINX32-NEXT: sw a1, 84(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a1, 100(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: lw a1, 12(a0) -; ZFINX32-NEXT: sw a1, 80(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a1, 96(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: lw a1, 16(a0) -; ZFINX32-NEXT: sw a1, 76(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a1, 92(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: lw a5, 20(a0) ; ZFINX32-NEXT: lw a6, 24(a0) ; ZFINX32-NEXT: lw a7, 28(a0) -; ZFINX32-NEXT: lw t2, 32(a0) -; ZFINX32-NEXT: lw t3, 36(a0) -; ZFINX32-NEXT: lw t4, 40(a0) -; ZFINX32-NEXT: lw t5, 44(a0) -; ZFINX32-NEXT: lw t6, 48(a0) -; ZFINX32-NEXT: lw t1, 52(a0) +; ZFINX32-NEXT: lw t3, 32(a0) +; ZFINX32-NEXT: lw t4, 36(a0) +; ZFINX32-NEXT: lw t5, 40(a0) +; ZFINX32-NEXT: lw t6, 44(a0) +; ZFINX32-NEXT: lw t1, 48(a0) +; ZFINX32-NEXT: lw t2, 52(a0) ; ZFINX32-NEXT: lw s0, 56(a0) ; ZFINX32-NEXT: lw s1, 60(a0) ; ZFINX32-NEXT: lw s2, 64(a0) @@ -464,83 +466,84 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZFINX32-NEXT: lw a2, 116(a0) ; ZFINX32-NEXT: lw a1, 120(a0) ; ZFINX32-NEXT: lw a0, 124(a0) -; ZFINX32-NEXT: sw a0, 72(sp) -; ZFINX32-NEXT: sw a1, 68(sp) -; ZFINX32-NEXT: sw a2, 64(sp) -; ZFINX32-NEXT: sw a3, 60(sp) -; ZFINX32-NEXT: sw a4, 56(sp) -; ZFINX32-NEXT: sw ra, 52(sp) -; ZFINX32-NEXT: sw s11, 48(sp) -; ZFINX32-NEXT: sw s10, 44(sp) -; ZFINX32-NEXT: sw s9, 40(sp) -; ZFINX32-NEXT: sw s8, 36(sp) -; ZFINX32-NEXT: sw s7, 32(sp) -; ZFINX32-NEXT: sw s6, 28(sp) -; ZFINX32-NEXT: sw s5, 24(sp) -; ZFINX32-NEXT: sw s4, 20(sp) -; ZFINX32-NEXT: sw s3, 16(sp) -; ZFINX32-NEXT: sw s2, 12(sp) -; ZFINX32-NEXT: sw s1, 8(sp) -; ZFINX32-NEXT: sw s0, 4(sp) +; ZFINX32-NEXT: sw a0, 76(sp) +; ZFINX32-NEXT: sw a1, 72(sp) +; ZFINX32-NEXT: sw a2, 68(sp) +; ZFINX32-NEXT: sw a3, 64(sp) +; ZFINX32-NEXT: sw a4, 60(sp) +; ZFINX32-NEXT: sw ra, 56(sp) +; ZFINX32-NEXT: sw s11, 52(sp) +; ZFINX32-NEXT: sw s10, 48(sp) +; ZFINX32-NEXT: sw s9, 44(sp) +; ZFINX32-NEXT: sw s8, 40(sp) +; ZFINX32-NEXT: sw s7, 36(sp) +; ZFINX32-NEXT: sw s6, 32(sp) +; ZFINX32-NEXT: sw s5, 28(sp) +; ZFINX32-NEXT: sw s4, 24(sp) +; ZFINX32-NEXT: sw s3, 20(sp) +; ZFINX32-NEXT: sw s2, 16(sp) +; ZFINX32-NEXT: sw s1, 12(sp) +; ZFINX32-NEXT: sw s0, 8(sp) +; ZFINX32-NEXT: sw t2, 4(sp) ; ZFINX32-NEXT: sw t1, 0(sp) ; ZFINX32-NEXT: mv a0, t0 -; ZFINX32-NEXT: lw a1, 88(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw a2, 84(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw a3, 80(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw a4, 76(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a1, 104(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a2, 100(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a3, 96(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a4, 92(sp) 
# 4-byte Folded Reload ; ZFINX32-NEXT: call callee_half_32 ; ZFINX32-NEXT: lui a1, 1048560 ; ZFINX32-NEXT: or a0, a0, a1 -; ZFINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: addi sp, sp, 144 +; ZFINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: addi sp, sp, 160 ; ZFINX32-NEXT: ret ; ; ZFINX64-LABEL: caller_half_32: ; ZFINX64: # %bb.0: -; ZFINX64-NEXT: addi sp, sp, -288 -; ZFINX64-NEXT: sd ra, 280(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s0, 272(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s1, 264(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s2, 256(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s3, 248(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s4, 240(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s5, 232(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s6, 224(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s7, 216(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s8, 208(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s9, 200(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s10, 192(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: sd s11, 184(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: addi sp, sp, -304 +; ZFINX64-NEXT: sd ra, 296(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s0, 288(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s1, 280(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s2, 272(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s3, 264(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s4, 256(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s5, 248(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s6, 240(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s7, 232(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s8, 224(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s9, 216(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s10, 208(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd s11, 200(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: ld t0, 0(a0) ; ZFINX64-NEXT: ld a1, 8(a0) -; ZFINX64-NEXT: sd a1, 176(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd a1, 192(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: ld a1, 16(a0) -; ZFINX64-NEXT: sd a1, 168(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd a1, 184(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: ld a1, 24(a0) -; ZFINX64-NEXT: sd a1, 160(sp) # 8-byte 
Folded Spill +; ZFINX64-NEXT: sd a1, 176(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: ld a1, 32(a0) -; ZFINX64-NEXT: sd a1, 152(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: sd a1, 168(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: ld a5, 40(a0) ; ZFINX64-NEXT: ld a6, 48(a0) ; ZFINX64-NEXT: ld a7, 56(a0) -; ZFINX64-NEXT: ld t2, 64(a0) -; ZFINX64-NEXT: ld t3, 72(a0) -; ZFINX64-NEXT: ld t4, 80(a0) -; ZFINX64-NEXT: ld t5, 88(a0) -; ZFINX64-NEXT: ld t6, 96(a0) -; ZFINX64-NEXT: ld t1, 104(a0) +; ZFINX64-NEXT: ld t3, 64(a0) +; ZFINX64-NEXT: ld t4, 72(a0) +; ZFINX64-NEXT: ld t5, 80(a0) +; ZFINX64-NEXT: ld t6, 88(a0) +; ZFINX64-NEXT: ld t1, 96(a0) +; ZFINX64-NEXT: ld t2, 104(a0) ; ZFINX64-NEXT: ld s0, 112(a0) ; ZFINX64-NEXT: ld s1, 120(a0) ; ZFINX64-NEXT: ld s2, 128(a0) @@ -559,83 +562,84 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZFINX64-NEXT: ld a2, 232(a0) ; ZFINX64-NEXT: ld a1, 240(a0) ; ZFINX64-NEXT: ld a0, 248(a0) -; ZFINX64-NEXT: sd a0, 144(sp) -; ZFINX64-NEXT: sd a1, 136(sp) -; ZFINX64-NEXT: sd a2, 128(sp) -; ZFINX64-NEXT: sd a3, 120(sp) -; ZFINX64-NEXT: sd a4, 112(sp) -; ZFINX64-NEXT: sd ra, 104(sp) -; ZFINX64-NEXT: sd s11, 96(sp) -; ZFINX64-NEXT: sd s10, 88(sp) -; ZFINX64-NEXT: sd s9, 80(sp) -; ZFINX64-NEXT: sd s8, 72(sp) -; ZFINX64-NEXT: sd s7, 64(sp) -; ZFINX64-NEXT: sd s6, 56(sp) -; ZFINX64-NEXT: sd s5, 48(sp) -; ZFINX64-NEXT: sd s4, 40(sp) -; ZFINX64-NEXT: sd s3, 32(sp) -; ZFINX64-NEXT: sd s2, 24(sp) -; ZFINX64-NEXT: sd s1, 16(sp) -; ZFINX64-NEXT: sd s0, 8(sp) +; ZFINX64-NEXT: sd a0, 152(sp) +; ZFINX64-NEXT: sd a1, 144(sp) +; ZFINX64-NEXT: sd a2, 136(sp) +; ZFINX64-NEXT: sd a3, 128(sp) +; ZFINX64-NEXT: sd a4, 120(sp) +; ZFINX64-NEXT: sd ra, 112(sp) +; ZFINX64-NEXT: sd s11, 104(sp) +; ZFINX64-NEXT: sd s10, 96(sp) +; ZFINX64-NEXT: sd s9, 88(sp) +; ZFINX64-NEXT: sd s8, 80(sp) +; ZFINX64-NEXT: sd s7, 72(sp) +; ZFINX64-NEXT: sd s6, 64(sp) +; ZFINX64-NEXT: sd s5, 56(sp) +; ZFINX64-NEXT: sd s4, 48(sp) +; ZFINX64-NEXT: sd s3, 40(sp) +; ZFINX64-NEXT: sd s2, 32(sp) +; ZFINX64-NEXT: sd s1, 24(sp) +; ZFINX64-NEXT: sd s0, 16(sp) +; ZFINX64-NEXT: sd t2, 8(sp) ; ZFINX64-NEXT: sd t1, 0(sp) ; ZFINX64-NEXT: mv a0, t0 -; ZFINX64-NEXT: ld a1, 176(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld a2, 168(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld a3, 160(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld a4, 152(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld a1, 192(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld a2, 184(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld a3, 176(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld a4, 168(sp) # 8-byte Folded Reload ; ZFINX64-NEXT: call callee_half_32 ; ZFINX64-NEXT: lui a1, 1048560 ; ZFINX64-NEXT: or a0, a0, a1 -; ZFINX64-NEXT: ld ra, 280(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s0, 272(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s1, 264(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s2, 256(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s3, 248(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s4, 240(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s5, 232(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s6, 224(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s7, 216(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s8, 208(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s9, 200(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s10, 192(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld s11, 184(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: addi sp, sp, 288 +; ZFINX64-NEXT: ld ra, 296(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s0, 288(sp) # 8-byte Folded Reload +; 
ZFINX64-NEXT: ld s1, 280(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s2, 272(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s3, 264(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s4, 256(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s5, 248(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s6, 240(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s7, 232(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s8, 224(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s9, 216(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s10, 208(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld s11, 200(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: addi sp, sp, 304 ; ZFINX64-NEXT: ret ; ; ZDINX32-LABEL: caller_half_32: ; ZDINX32: # %bb.0: -; ZDINX32-NEXT: addi sp, sp, -144 -; ZDINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: addi sp, sp, -160 +; ZDINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: lw t0, 0(a0) ; ZDINX32-NEXT: lw a1, 4(a0) -; ZDINX32-NEXT: sw a1, 88(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a1, 104(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: lw a1, 8(a0) -; ZDINX32-NEXT: sw a1, 84(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a1, 100(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: lw a1, 12(a0) -; ZDINX32-NEXT: sw a1, 80(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a1, 96(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: lw a1, 16(a0) -; ZDINX32-NEXT: sw a1, 76(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a1, 92(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: lw a5, 20(a0) ; ZDINX32-NEXT: lw a6, 24(a0) ; ZDINX32-NEXT: lw a7, 28(a0) -; ZDINX32-NEXT: lw t2, 32(a0) -; ZDINX32-NEXT: lw t3, 36(a0) -; ZDINX32-NEXT: lw t4, 40(a0) -; ZDINX32-NEXT: lw t5, 44(a0) -; ZDINX32-NEXT: lw t6, 48(a0) -; ZDINX32-NEXT: lw t1, 52(a0) +; ZDINX32-NEXT: lw t3, 32(a0) +; ZDINX32-NEXT: lw t4, 36(a0) +; ZDINX32-NEXT: lw t5, 40(a0) +; ZDINX32-NEXT: lw t6, 44(a0) +; ZDINX32-NEXT: lw t1, 48(a0) +; ZDINX32-NEXT: lw t2, 52(a0) ; ZDINX32-NEXT: lw s0, 56(a0) ; ZDINX32-NEXT: lw s1, 60(a0) ; ZDINX32-NEXT: lw s2, 64(a0) @@ -654,83 +658,84 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZDINX32-NEXT: lw a2, 116(a0) ; ZDINX32-NEXT: lw a1, 120(a0) ; ZDINX32-NEXT: lw a0, 124(a0) -; ZDINX32-NEXT: sw a0, 72(sp) -; ZDINX32-NEXT: sw a1, 68(sp) -; 
ZDINX32-NEXT: sw a2, 64(sp) -; ZDINX32-NEXT: sw a3, 60(sp) -; ZDINX32-NEXT: sw a4, 56(sp) -; ZDINX32-NEXT: sw ra, 52(sp) -; ZDINX32-NEXT: sw s11, 48(sp) -; ZDINX32-NEXT: sw s10, 44(sp) -; ZDINX32-NEXT: sw s9, 40(sp) -; ZDINX32-NEXT: sw s8, 36(sp) -; ZDINX32-NEXT: sw s7, 32(sp) -; ZDINX32-NEXT: sw s6, 28(sp) -; ZDINX32-NEXT: sw s5, 24(sp) -; ZDINX32-NEXT: sw s4, 20(sp) -; ZDINX32-NEXT: sw s3, 16(sp) -; ZDINX32-NEXT: sw s2, 12(sp) -; ZDINX32-NEXT: sw s1, 8(sp) -; ZDINX32-NEXT: sw s0, 4(sp) +; ZDINX32-NEXT: sw a0, 76(sp) +; ZDINX32-NEXT: sw a1, 72(sp) +; ZDINX32-NEXT: sw a2, 68(sp) +; ZDINX32-NEXT: sw a3, 64(sp) +; ZDINX32-NEXT: sw a4, 60(sp) +; ZDINX32-NEXT: sw ra, 56(sp) +; ZDINX32-NEXT: sw s11, 52(sp) +; ZDINX32-NEXT: sw s10, 48(sp) +; ZDINX32-NEXT: sw s9, 44(sp) +; ZDINX32-NEXT: sw s8, 40(sp) +; ZDINX32-NEXT: sw s7, 36(sp) +; ZDINX32-NEXT: sw s6, 32(sp) +; ZDINX32-NEXT: sw s5, 28(sp) +; ZDINX32-NEXT: sw s4, 24(sp) +; ZDINX32-NEXT: sw s3, 20(sp) +; ZDINX32-NEXT: sw s2, 16(sp) +; ZDINX32-NEXT: sw s1, 12(sp) +; ZDINX32-NEXT: sw s0, 8(sp) +; ZDINX32-NEXT: sw t2, 4(sp) ; ZDINX32-NEXT: sw t1, 0(sp) ; ZDINX32-NEXT: mv a0, t0 -; ZDINX32-NEXT: lw a1, 88(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw a2, 84(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw a3, 80(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw a4, 76(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a1, 104(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a2, 100(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a3, 96(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a4, 92(sp) # 4-byte Folded Reload ; ZDINX32-NEXT: call callee_half_32 ; ZDINX32-NEXT: lui a1, 1048560 ; ZDINX32-NEXT: or a0, a0, a1 -; ZDINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: addi sp, sp, 144 +; ZDINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: addi sp, sp, 160 ; ZDINX32-NEXT: ret ; ; ZDINX64-LABEL: caller_half_32: ; ZDINX64: # %bb.0: -; ZDINX64-NEXT: addi sp, sp, -288 -; ZDINX64-NEXT: sd ra, 280(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s0, 272(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s1, 264(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s2, 256(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s3, 248(sp) # 8-byte Folded Spill -; 
ZDINX64-NEXT: sd s4, 240(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s5, 232(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s6, 224(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s7, 216(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s8, 208(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s9, 200(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s10, 192(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: sd s11, 184(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: addi sp, sp, -304 +; ZDINX64-NEXT: sd ra, 296(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s0, 288(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s1, 280(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s2, 272(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s3, 264(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s4, 256(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s5, 248(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s6, 240(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s7, 232(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s8, 224(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s9, 216(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s10, 208(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd s11, 200(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: ld t0, 0(a0) ; ZDINX64-NEXT: ld a1, 8(a0) -; ZDINX64-NEXT: sd a1, 176(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd a1, 192(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: ld a1, 16(a0) -; ZDINX64-NEXT: sd a1, 168(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd a1, 184(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: ld a1, 24(a0) -; ZDINX64-NEXT: sd a1, 160(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd a1, 176(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: ld a1, 32(a0) -; ZDINX64-NEXT: sd a1, 152(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: sd a1, 168(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: ld a5, 40(a0) ; ZDINX64-NEXT: ld a6, 48(a0) ; ZDINX64-NEXT: ld a7, 56(a0) -; ZDINX64-NEXT: ld t2, 64(a0) -; ZDINX64-NEXT: ld t3, 72(a0) -; ZDINX64-NEXT: ld t4, 80(a0) -; ZDINX64-NEXT: ld t5, 88(a0) -; ZDINX64-NEXT: ld t6, 96(a0) -; ZDINX64-NEXT: ld t1, 104(a0) +; ZDINX64-NEXT: ld t3, 64(a0) +; ZDINX64-NEXT: ld t4, 72(a0) +; ZDINX64-NEXT: ld t5, 80(a0) +; ZDINX64-NEXT: ld t6, 88(a0) +; ZDINX64-NEXT: ld t1, 96(a0) +; ZDINX64-NEXT: ld t2, 104(a0) ; ZDINX64-NEXT: ld s0, 112(a0) ; ZDINX64-NEXT: ld s1, 120(a0) ; ZDINX64-NEXT: ld s2, 128(a0) @@ -749,47 +754,48 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZDINX64-NEXT: ld a2, 232(a0) ; ZDINX64-NEXT: ld a1, 240(a0) ; ZDINX64-NEXT: ld a0, 248(a0) -; ZDINX64-NEXT: sd a0, 144(sp) -; ZDINX64-NEXT: sd a1, 136(sp) -; ZDINX64-NEXT: sd a2, 128(sp) -; ZDINX64-NEXT: sd a3, 120(sp) -; ZDINX64-NEXT: sd a4, 112(sp) -; ZDINX64-NEXT: sd ra, 104(sp) -; ZDINX64-NEXT: sd s11, 96(sp) -; ZDINX64-NEXT: sd s10, 88(sp) -; ZDINX64-NEXT: sd s9, 80(sp) -; ZDINX64-NEXT: sd s8, 72(sp) -; ZDINX64-NEXT: sd s7, 64(sp) -; ZDINX64-NEXT: sd s6, 56(sp) -; ZDINX64-NEXT: sd s5, 48(sp) -; ZDINX64-NEXT: sd s4, 40(sp) -; ZDINX64-NEXT: sd s3, 32(sp) -; ZDINX64-NEXT: sd s2, 24(sp) -; ZDINX64-NEXT: sd s1, 16(sp) -; ZDINX64-NEXT: sd s0, 8(sp) +; ZDINX64-NEXT: sd a0, 152(sp) +; ZDINX64-NEXT: sd a1, 144(sp) +; ZDINX64-NEXT: sd a2, 136(sp) +; ZDINX64-NEXT: sd a3, 128(sp) +; ZDINX64-NEXT: sd a4, 120(sp) +; ZDINX64-NEXT: sd ra, 112(sp) +; ZDINX64-NEXT: sd s11, 104(sp) +; ZDINX64-NEXT: sd s10, 96(sp) +; ZDINX64-NEXT: sd s9, 88(sp) +; ZDINX64-NEXT: sd s8, 80(sp) +; ZDINX64-NEXT: sd s7, 72(sp) +; ZDINX64-NEXT: sd s6, 64(sp) +; ZDINX64-NEXT: sd s5, 56(sp) +; ZDINX64-NEXT: sd s4, 48(sp) +; ZDINX64-NEXT: sd s3, 40(sp) +; ZDINX64-NEXT: sd s2, 32(sp) +; 
ZDINX64-NEXT: sd s1, 24(sp) +; ZDINX64-NEXT: sd s0, 16(sp) +; ZDINX64-NEXT: sd t2, 8(sp) ; ZDINX64-NEXT: sd t1, 0(sp) ; ZDINX64-NEXT: mv a0, t0 -; ZDINX64-NEXT: ld a1, 176(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld a2, 168(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld a3, 160(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld a4, 152(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld a1, 192(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld a2, 184(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld a3, 176(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld a4, 168(sp) # 8-byte Folded Reload ; ZDINX64-NEXT: call callee_half_32 ; ZDINX64-NEXT: lui a1, 1048560 ; ZDINX64-NEXT: or a0, a0, a1 -; ZDINX64-NEXT: ld ra, 280(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s0, 272(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s1, 264(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s2, 256(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s3, 248(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s4, 240(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s5, 232(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s6, 224(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s7, 216(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s8, 208(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s9, 200(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s10, 192(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld s11, 184(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: addi sp, sp, 288 +; ZDINX64-NEXT: ld ra, 296(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s0, 288(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s1, 280(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s2, 272(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s3, 264(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s4, 256(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s5, 248(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s6, 240(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s7, 232(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s8, 224(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s9, 216(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s10, 208(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld s11, 200(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: addi sp, sp, 304 ; ZDINX64-NEXT: ret %C = call fastcc half @callee_half_32(<32 x half> %A) ret half %C @@ -826,86 +832,87 @@ define fastcc float @callee_float_32(<32 x float> %A) nounwind { define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-LABEL: caller_float_32: ; ZHINX32: # %bb.0: -; ZHINX32-NEXT: addi sp, sp, -144 -; ZHINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 144(sp) -; ZHINX32-NEXT: sw t0, 88(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 148(sp) -; ZHINX32-NEXT: sw t0, 84(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 152(sp) -; ZHINX32-NEXT: sw t0, 80(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 156(sp) -; ZHINX32-NEXT: sw t0, 76(sp) # 4-byte Folded Spill -; 
ZHINX32-NEXT: lw t6, 160(sp) -; ZHINX32-NEXT: lw t5, 164(sp) -; ZHINX32-NEXT: lw t4, 168(sp) -; ZHINX32-NEXT: lw s0, 172(sp) -; ZHINX32-NEXT: lw s1, 176(sp) -; ZHINX32-NEXT: lw s2, 180(sp) -; ZHINX32-NEXT: lw s3, 184(sp) -; ZHINX32-NEXT: lw s4, 188(sp) -; ZHINX32-NEXT: lw s5, 192(sp) -; ZHINX32-NEXT: lw s6, 196(sp) -; ZHINX32-NEXT: lw s7, 200(sp) -; ZHINX32-NEXT: lw s8, 204(sp) -; ZHINX32-NEXT: lw s9, 208(sp) -; ZHINX32-NEXT: lw s10, 212(sp) -; ZHINX32-NEXT: lw s11, 216(sp) -; ZHINX32-NEXT: lw ra, 220(sp) -; ZHINX32-NEXT: lw t3, 224(sp) -; ZHINX32-NEXT: lw t2, 228(sp) -; ZHINX32-NEXT: lw t1, 232(sp) -; ZHINX32-NEXT: lw t0, 236(sp) -; ZHINX32-NEXT: sw t0, 72(sp) -; ZHINX32-NEXT: sw t1, 68(sp) -; ZHINX32-NEXT: sw t2, 64(sp) -; ZHINX32-NEXT: sw t3, 60(sp) -; ZHINX32-NEXT: sw ra, 56(sp) -; ZHINX32-NEXT: sw s11, 52(sp) -; ZHINX32-NEXT: sw s10, 48(sp) -; ZHINX32-NEXT: sw s9, 44(sp) -; ZHINX32-NEXT: sw s8, 40(sp) -; ZHINX32-NEXT: sw s7, 36(sp) -; ZHINX32-NEXT: sw s6, 32(sp) -; ZHINX32-NEXT: sw s5, 28(sp) -; ZHINX32-NEXT: sw s4, 24(sp) -; ZHINX32-NEXT: sw s3, 20(sp) -; ZHINX32-NEXT: sw s2, 16(sp) -; ZHINX32-NEXT: sw s1, 12(sp) -; ZHINX32-NEXT: sw s0, 8(sp) -; ZHINX32-NEXT: sw t4, 4(sp) -; ZHINX32-NEXT: sw t5, 0(sp) -; ZHINX32-NEXT: lw t2, 88(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t3, 84(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t4, 80(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t5, 76(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: addi sp, sp, -160 +; ZHINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: lw t0, 160(sp) +; ZHINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: lw t0, 164(sp) +; ZHINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: lw t0, 168(sp) +; ZHINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: lw t0, 172(sp) +; ZHINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: lw t6, 176(sp) +; ZHINX32-NEXT: lw t5, 180(sp) +; ZHINX32-NEXT: lw t4, 184(sp) +; ZHINX32-NEXT: lw s0, 188(sp) +; ZHINX32-NEXT: lw s1, 192(sp) +; ZHINX32-NEXT: lw s2, 196(sp) +; ZHINX32-NEXT: lw s3, 200(sp) +; ZHINX32-NEXT: lw s4, 204(sp) +; ZHINX32-NEXT: lw s5, 208(sp) +; ZHINX32-NEXT: lw s6, 212(sp) +; ZHINX32-NEXT: lw s7, 216(sp) +; ZHINX32-NEXT: lw s8, 220(sp) +; ZHINX32-NEXT: lw s9, 224(sp) +; ZHINX32-NEXT: lw s10, 228(sp) +; ZHINX32-NEXT: lw s11, 232(sp) +; ZHINX32-NEXT: lw ra, 236(sp) +; ZHINX32-NEXT: lw t3, 240(sp) +; ZHINX32-NEXT: lw t2, 244(sp) +; ZHINX32-NEXT: lw t1, 248(sp) +; ZHINX32-NEXT: lw t0, 252(sp) +; ZHINX32-NEXT: sw t0, 76(sp) +; ZHINX32-NEXT: sw t1, 72(sp) +; ZHINX32-NEXT: sw t2, 68(sp) +; ZHINX32-NEXT: sw t3, 64(sp) +; ZHINX32-NEXT: sw ra, 60(sp) +; ZHINX32-NEXT: sw s11, 56(sp) +; ZHINX32-NEXT: sw s10, 52(sp) +; ZHINX32-NEXT: sw s9, 48(sp) +; ZHINX32-NEXT: sw s8, 44(sp) +; ZHINX32-NEXT: sw s7, 40(sp) +; ZHINX32-NEXT: sw s6, 36(sp) +; ZHINX32-NEXT: sw s5, 32(sp) +; 
ZHINX32-NEXT: sw s4, 28(sp) +; ZHINX32-NEXT: sw s3, 24(sp) +; ZHINX32-NEXT: sw s2, 20(sp) +; ZHINX32-NEXT: sw s1, 16(sp) +; ZHINX32-NEXT: sw s0, 12(sp) +; ZHINX32-NEXT: sw t4, 8(sp) +; ZHINX32-NEXT: sw t5, 4(sp) +; ZHINX32-NEXT: sw t6, 0(sp) +; ZHINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: call callee_float_32 -; ZHINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: addi sp, sp, 144 +; ZHINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: addi sp, sp, 160 ; ZHINX32-NEXT: ret ; ; ZHINX64-LABEL: caller_float_32: @@ -952,29 +959,30 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: lw t2, 392(sp) ; ZHINX64-NEXT: lw t1, 400(sp) ; ZHINX64-NEXT: lw t0, 408(sp) -; ZHINX64-NEXT: sw t0, 72(sp) -; ZHINX64-NEXT: sw t1, 68(sp) -; ZHINX64-NEXT: sw t2, 64(sp) -; ZHINX64-NEXT: sw t3, 60(sp) -; ZHINX64-NEXT: sw ra, 56(sp) -; ZHINX64-NEXT: sw s11, 52(sp) -; ZHINX64-NEXT: sw s10, 48(sp) -; ZHINX64-NEXT: sw s9, 44(sp) -; ZHINX64-NEXT: sw s8, 40(sp) -; ZHINX64-NEXT: sw s7, 36(sp) -; ZHINX64-NEXT: sw s6, 32(sp) -; ZHINX64-NEXT: sw s5, 28(sp) -; ZHINX64-NEXT: sw s4, 24(sp) -; ZHINX64-NEXT: sw s3, 20(sp) -; ZHINX64-NEXT: sw s2, 16(sp) -; ZHINX64-NEXT: sw s1, 12(sp) -; ZHINX64-NEXT: sw s0, 8(sp) -; ZHINX64-NEXT: sw t4, 4(sp) -; ZHINX64-NEXT: sw t5, 0(sp) -; ZHINX64-NEXT: ld t2, 112(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t3, 104(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t4, 96(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t5, 88(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: sw t0, 76(sp) +; ZHINX64-NEXT: sw t1, 72(sp) +; ZHINX64-NEXT: sw t2, 68(sp) +; ZHINX64-NEXT: sw t3, 64(sp) +; ZHINX64-NEXT: sw ra, 60(sp) +; ZHINX64-NEXT: sw s11, 56(sp) +; ZHINX64-NEXT: sw s10, 52(sp) +; ZHINX64-NEXT: sw s9, 48(sp) +; ZHINX64-NEXT: sw s8, 44(sp) +; ZHINX64-NEXT: sw s7, 40(sp) +; ZHINX64-NEXT: sw s6, 36(sp) +; ZHINX64-NEXT: sw s5, 32(sp) +; ZHINX64-NEXT: sw s4, 28(sp) +; ZHINX64-NEXT: sw s3, 24(sp) +; ZHINX64-NEXT: sw s2, 20(sp) +; ZHINX64-NEXT: sw s1, 16(sp) +; ZHINX64-NEXT: sw s0, 
12(sp) +; ZHINX64-NEXT: sw t4, 8(sp) +; ZHINX64-NEXT: sw t5, 4(sp) +; ZHINX64-NEXT: sw t6, 0(sp) +; ZHINX64-NEXT: ld t3, 112(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t4, 104(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t5, 96(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t6, 88(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: call callee_float_32 ; ZHINX64-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: ld s0, 208(sp) # 8-byte Folded Reload @@ -994,86 +1002,87 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ; ZFINX32-LABEL: caller_float_32: ; ZFINX32: # %bb.0: -; ZFINX32-NEXT: addi sp, sp, -144 -; ZFINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 144(sp) -; ZFINX32-NEXT: sw t0, 88(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 148(sp) -; ZFINX32-NEXT: sw t0, 84(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 152(sp) -; ZFINX32-NEXT: sw t0, 80(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 156(sp) -; ZFINX32-NEXT: sw t0, 76(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t6, 160(sp) -; ZFINX32-NEXT: lw t5, 164(sp) -; ZFINX32-NEXT: lw t4, 168(sp) -; ZFINX32-NEXT: lw s0, 172(sp) -; ZFINX32-NEXT: lw s1, 176(sp) -; ZFINX32-NEXT: lw s2, 180(sp) -; ZFINX32-NEXT: lw s3, 184(sp) -; ZFINX32-NEXT: lw s4, 188(sp) -; ZFINX32-NEXT: lw s5, 192(sp) -; ZFINX32-NEXT: lw s6, 196(sp) -; ZFINX32-NEXT: lw s7, 200(sp) -; ZFINX32-NEXT: lw s8, 204(sp) -; ZFINX32-NEXT: lw s9, 208(sp) -; ZFINX32-NEXT: lw s10, 212(sp) -; ZFINX32-NEXT: lw s11, 216(sp) -; ZFINX32-NEXT: lw ra, 220(sp) -; ZFINX32-NEXT: lw t3, 224(sp) -; ZFINX32-NEXT: lw t2, 228(sp) -; ZFINX32-NEXT: lw t1, 232(sp) -; ZFINX32-NEXT: lw t0, 236(sp) -; ZFINX32-NEXT: sw t0, 72(sp) -; ZFINX32-NEXT: sw t1, 68(sp) -; ZFINX32-NEXT: sw t2, 64(sp) -; ZFINX32-NEXT: sw t3, 60(sp) -; ZFINX32-NEXT: sw ra, 56(sp) -; ZFINX32-NEXT: sw s11, 52(sp) -; ZFINX32-NEXT: sw s10, 48(sp) -; ZFINX32-NEXT: sw s9, 44(sp) -; ZFINX32-NEXT: sw s8, 40(sp) -; ZFINX32-NEXT: sw s7, 36(sp) -; ZFINX32-NEXT: sw s6, 32(sp) -; ZFINX32-NEXT: sw s5, 28(sp) -; ZFINX32-NEXT: sw s4, 24(sp) -; ZFINX32-NEXT: sw s3, 20(sp) -; ZFINX32-NEXT: sw s2, 16(sp) -; ZFINX32-NEXT: sw s1, 12(sp) -; ZFINX32-NEXT: sw s0, 8(sp) -; ZFINX32-NEXT: sw t4, 4(sp) -; ZFINX32-NEXT: sw t5, 0(sp) -; ZFINX32-NEXT: lw t2, 88(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t3, 84(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t4, 80(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t5, 76(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: addi sp, sp, -160 +; ZFINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw 
s6, 128(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: lw t0, 160(sp) +; ZFINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: lw t0, 164(sp) +; ZFINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: lw t0, 168(sp) +; ZFINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: lw t0, 172(sp) +; ZFINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: lw t6, 176(sp) +; ZFINX32-NEXT: lw t5, 180(sp) +; ZFINX32-NEXT: lw t4, 184(sp) +; ZFINX32-NEXT: lw s0, 188(sp) +; ZFINX32-NEXT: lw s1, 192(sp) +; ZFINX32-NEXT: lw s2, 196(sp) +; ZFINX32-NEXT: lw s3, 200(sp) +; ZFINX32-NEXT: lw s4, 204(sp) +; ZFINX32-NEXT: lw s5, 208(sp) +; ZFINX32-NEXT: lw s6, 212(sp) +; ZFINX32-NEXT: lw s7, 216(sp) +; ZFINX32-NEXT: lw s8, 220(sp) +; ZFINX32-NEXT: lw s9, 224(sp) +; ZFINX32-NEXT: lw s10, 228(sp) +; ZFINX32-NEXT: lw s11, 232(sp) +; ZFINX32-NEXT: lw ra, 236(sp) +; ZFINX32-NEXT: lw t3, 240(sp) +; ZFINX32-NEXT: lw t2, 244(sp) +; ZFINX32-NEXT: lw t1, 248(sp) +; ZFINX32-NEXT: lw t0, 252(sp) +; ZFINX32-NEXT: sw t0, 76(sp) +; ZFINX32-NEXT: sw t1, 72(sp) +; ZFINX32-NEXT: sw t2, 68(sp) +; ZFINX32-NEXT: sw t3, 64(sp) +; ZFINX32-NEXT: sw ra, 60(sp) +; ZFINX32-NEXT: sw s11, 56(sp) +; ZFINX32-NEXT: sw s10, 52(sp) +; ZFINX32-NEXT: sw s9, 48(sp) +; ZFINX32-NEXT: sw s8, 44(sp) +; ZFINX32-NEXT: sw s7, 40(sp) +; ZFINX32-NEXT: sw s6, 36(sp) +; ZFINX32-NEXT: sw s5, 32(sp) +; ZFINX32-NEXT: sw s4, 28(sp) +; ZFINX32-NEXT: sw s3, 24(sp) +; ZFINX32-NEXT: sw s2, 20(sp) +; ZFINX32-NEXT: sw s1, 16(sp) +; ZFINX32-NEXT: sw s0, 12(sp) +; ZFINX32-NEXT: sw t4, 8(sp) +; ZFINX32-NEXT: sw t5, 4(sp) +; ZFINX32-NEXT: sw t6, 0(sp) +; ZFINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload ; ZFINX32-NEXT: call callee_float_32 -; ZFINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: addi sp, sp, 144 +; ZFINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s10, 112(sp) # 
4-byte Folded Reload +; ZFINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: addi sp, sp, 160 ; ZFINX32-NEXT: ret ; ; ZFINX64-LABEL: caller_float_32: @@ -1120,29 +1129,30 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: lw t2, 392(sp) ; ZFINX64-NEXT: lw t1, 400(sp) ; ZFINX64-NEXT: lw t0, 408(sp) -; ZFINX64-NEXT: sw t0, 72(sp) -; ZFINX64-NEXT: sw t1, 68(sp) -; ZFINX64-NEXT: sw t2, 64(sp) -; ZFINX64-NEXT: sw t3, 60(sp) -; ZFINX64-NEXT: sw ra, 56(sp) -; ZFINX64-NEXT: sw s11, 52(sp) -; ZFINX64-NEXT: sw s10, 48(sp) -; ZFINX64-NEXT: sw s9, 44(sp) -; ZFINX64-NEXT: sw s8, 40(sp) -; ZFINX64-NEXT: sw s7, 36(sp) -; ZFINX64-NEXT: sw s6, 32(sp) -; ZFINX64-NEXT: sw s5, 28(sp) -; ZFINX64-NEXT: sw s4, 24(sp) -; ZFINX64-NEXT: sw s3, 20(sp) -; ZFINX64-NEXT: sw s2, 16(sp) -; ZFINX64-NEXT: sw s1, 12(sp) -; ZFINX64-NEXT: sw s0, 8(sp) -; ZFINX64-NEXT: sw t4, 4(sp) -; ZFINX64-NEXT: sw t5, 0(sp) -; ZFINX64-NEXT: ld t2, 112(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld t3, 104(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld t4, 96(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld t5, 88(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: sw t0, 76(sp) +; ZFINX64-NEXT: sw t1, 72(sp) +; ZFINX64-NEXT: sw t2, 68(sp) +; ZFINX64-NEXT: sw t3, 64(sp) +; ZFINX64-NEXT: sw ra, 60(sp) +; ZFINX64-NEXT: sw s11, 56(sp) +; ZFINX64-NEXT: sw s10, 52(sp) +; ZFINX64-NEXT: sw s9, 48(sp) +; ZFINX64-NEXT: sw s8, 44(sp) +; ZFINX64-NEXT: sw s7, 40(sp) +; ZFINX64-NEXT: sw s6, 36(sp) +; ZFINX64-NEXT: sw s5, 32(sp) +; ZFINX64-NEXT: sw s4, 28(sp) +; ZFINX64-NEXT: sw s3, 24(sp) +; ZFINX64-NEXT: sw s2, 20(sp) +; ZFINX64-NEXT: sw s1, 16(sp) +; ZFINX64-NEXT: sw s0, 12(sp) +; ZFINX64-NEXT: sw t4, 8(sp) +; ZFINX64-NEXT: sw t5, 4(sp) +; ZFINX64-NEXT: sw t6, 0(sp) +; ZFINX64-NEXT: ld t3, 112(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t4, 104(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t5, 96(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t6, 88(sp) # 8-byte Folded Reload ; ZFINX64-NEXT: call callee_float_32 ; ZFINX64-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; ZFINX64-NEXT: ld s0, 208(sp) # 8-byte Folded Reload @@ -1162,86 +1172,87 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ; ZDINX32-LABEL: caller_float_32: ; ZDINX32: # %bb.0: -; ZDINX32-NEXT: addi sp, sp, -144 -; ZDINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 144(sp) -; ZDINX32-NEXT: sw t0, 88(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 148(sp) -; ZDINX32-NEXT: sw t0, 84(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 152(sp) -; ZDINX32-NEXT: sw t0, 80(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 156(sp) -; ZDINX32-NEXT: sw t0, 76(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t6, 160(sp) -; ZDINX32-NEXT: lw t5, 164(sp) -; ZDINX32-NEXT: lw t4, 168(sp) -; ZDINX32-NEXT: lw s0, 172(sp) -; ZDINX32-NEXT: lw s1, 176(sp) -; ZDINX32-NEXT: lw s2, 180(sp) -; ZDINX32-NEXT: 
lw s3, 184(sp) -; ZDINX32-NEXT: lw s4, 188(sp) -; ZDINX32-NEXT: lw s5, 192(sp) -; ZDINX32-NEXT: lw s6, 196(sp) -; ZDINX32-NEXT: lw s7, 200(sp) -; ZDINX32-NEXT: lw s8, 204(sp) -; ZDINX32-NEXT: lw s9, 208(sp) -; ZDINX32-NEXT: lw s10, 212(sp) -; ZDINX32-NEXT: lw s11, 216(sp) -; ZDINX32-NEXT: lw ra, 220(sp) -; ZDINX32-NEXT: lw t3, 224(sp) -; ZDINX32-NEXT: lw t2, 228(sp) -; ZDINX32-NEXT: lw t1, 232(sp) -; ZDINX32-NEXT: lw t0, 236(sp) -; ZDINX32-NEXT: sw t0, 72(sp) -; ZDINX32-NEXT: sw t1, 68(sp) -; ZDINX32-NEXT: sw t2, 64(sp) -; ZDINX32-NEXT: sw t3, 60(sp) -; ZDINX32-NEXT: sw ra, 56(sp) -; ZDINX32-NEXT: sw s11, 52(sp) -; ZDINX32-NEXT: sw s10, 48(sp) -; ZDINX32-NEXT: sw s9, 44(sp) -; ZDINX32-NEXT: sw s8, 40(sp) -; ZDINX32-NEXT: sw s7, 36(sp) -; ZDINX32-NEXT: sw s6, 32(sp) -; ZDINX32-NEXT: sw s5, 28(sp) -; ZDINX32-NEXT: sw s4, 24(sp) -; ZDINX32-NEXT: sw s3, 20(sp) -; ZDINX32-NEXT: sw s2, 16(sp) -; ZDINX32-NEXT: sw s1, 12(sp) -; ZDINX32-NEXT: sw s0, 8(sp) -; ZDINX32-NEXT: sw t4, 4(sp) -; ZDINX32-NEXT: sw t5, 0(sp) -; ZDINX32-NEXT: lw t2, 88(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t3, 84(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t4, 80(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t5, 76(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: addi sp, sp, -160 +; ZDINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: lw t0, 160(sp) +; ZDINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: lw t0, 164(sp) +; ZDINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: lw t0, 168(sp) +; ZDINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: lw t0, 172(sp) +; ZDINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: lw t6, 176(sp) +; ZDINX32-NEXT: lw t5, 180(sp) +; ZDINX32-NEXT: lw t4, 184(sp) +; ZDINX32-NEXT: lw s0, 188(sp) +; ZDINX32-NEXT: lw s1, 192(sp) +; ZDINX32-NEXT: lw s2, 196(sp) +; ZDINX32-NEXT: lw s3, 200(sp) +; ZDINX32-NEXT: lw s4, 204(sp) +; ZDINX32-NEXT: lw s5, 208(sp) +; ZDINX32-NEXT: lw s6, 212(sp) +; ZDINX32-NEXT: lw s7, 216(sp) +; ZDINX32-NEXT: lw s8, 220(sp) +; ZDINX32-NEXT: lw s9, 224(sp) +; ZDINX32-NEXT: lw s10, 228(sp) +; ZDINX32-NEXT: lw s11, 232(sp) +; ZDINX32-NEXT: lw ra, 236(sp) +; ZDINX32-NEXT: lw t3, 240(sp) +; ZDINX32-NEXT: lw t2, 244(sp) +; ZDINX32-NEXT: lw t1, 248(sp) +; ZDINX32-NEXT: lw t0, 252(sp) +; ZDINX32-NEXT: sw t0, 76(sp) +; ZDINX32-NEXT: sw t1, 72(sp) +; ZDINX32-NEXT: sw t2, 68(sp) +; ZDINX32-NEXT: sw t3, 64(sp) +; ZDINX32-NEXT: sw ra, 60(sp) +; ZDINX32-NEXT: sw s11, 56(sp) +; ZDINX32-NEXT: sw s10, 52(sp) +; ZDINX32-NEXT: sw s9, 48(sp) +; ZDINX32-NEXT: sw s8, 44(sp) +; ZDINX32-NEXT: sw s7, 40(sp) +; ZDINX32-NEXT: sw s6, 36(sp) +; ZDINX32-NEXT: sw s5, 32(sp) +; ZDINX32-NEXT: sw s4, 28(sp) +; ZDINX32-NEXT: sw s3, 24(sp) +; ZDINX32-NEXT: sw s2, 20(sp) +; ZDINX32-NEXT: sw s1, 16(sp) +; ZDINX32-NEXT: sw s0, 12(sp) +; ZDINX32-NEXT: sw t4, 8(sp) +; ZDINX32-NEXT: sw t5, 4(sp) +; 
ZDINX32-NEXT: sw t6, 0(sp) +; ZDINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload ; ZDINX32-NEXT: call callee_float_32 -; ZDINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: addi sp, sp, 144 +; ZDINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: addi sp, sp, 160 ; ZDINX32-NEXT: ret ; ; ZDINX64-LABEL: caller_float_32: @@ -1288,29 +1299,30 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: lw t2, 392(sp) ; ZDINX64-NEXT: lw t1, 400(sp) ; ZDINX64-NEXT: lw t0, 408(sp) -; ZDINX64-NEXT: sw t0, 72(sp) -; ZDINX64-NEXT: sw t1, 68(sp) -; ZDINX64-NEXT: sw t2, 64(sp) -; ZDINX64-NEXT: sw t3, 60(sp) -; ZDINX64-NEXT: sw ra, 56(sp) -; ZDINX64-NEXT: sw s11, 52(sp) -; ZDINX64-NEXT: sw s10, 48(sp) -; ZDINX64-NEXT: sw s9, 44(sp) -; ZDINX64-NEXT: sw s8, 40(sp) -; ZDINX64-NEXT: sw s7, 36(sp) -; ZDINX64-NEXT: sw s6, 32(sp) -; ZDINX64-NEXT: sw s5, 28(sp) -; ZDINX64-NEXT: sw s4, 24(sp) -; ZDINX64-NEXT: sw s3, 20(sp) -; ZDINX64-NEXT: sw s2, 16(sp) -; ZDINX64-NEXT: sw s1, 12(sp) -; ZDINX64-NEXT: sw s0, 8(sp) -; ZDINX64-NEXT: sw t4, 4(sp) -; ZDINX64-NEXT: sw t5, 0(sp) -; ZDINX64-NEXT: ld t2, 112(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld t3, 104(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld t4, 96(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld t5, 88(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: sw t0, 76(sp) +; ZDINX64-NEXT: sw t1, 72(sp) +; ZDINX64-NEXT: sw t2, 68(sp) +; ZDINX64-NEXT: sw t3, 64(sp) +; ZDINX64-NEXT: sw ra, 60(sp) +; ZDINX64-NEXT: sw s11, 56(sp) +; ZDINX64-NEXT: sw s10, 52(sp) +; ZDINX64-NEXT: sw s9, 48(sp) +; ZDINX64-NEXT: sw s8, 44(sp) +; ZDINX64-NEXT: sw s7, 40(sp) +; ZDINX64-NEXT: sw s6, 36(sp) +; ZDINX64-NEXT: sw s5, 32(sp) +; ZDINX64-NEXT: sw s4, 28(sp) +; ZDINX64-NEXT: sw s3, 24(sp) +; ZDINX64-NEXT: sw s2, 20(sp) +; ZDINX64-NEXT: sw s1, 16(sp) +; ZDINX64-NEXT: sw s0, 12(sp) +; ZDINX64-NEXT: sw t4, 8(sp) +; ZDINX64-NEXT: sw t5, 4(sp) +; ZDINX64-NEXT: sw t6, 0(sp) +; ZDINX64-NEXT: ld t3, 112(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld t4, 104(sp) # 8-byte Folded Reload +; 
ZDINX64-NEXT: ld t5, 96(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld t6, 88(sp) # 8-byte Folded Reload ; ZDINX64-NEXT: call callee_float_32 ; ZDINX64-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; ZDINX64-NEXT: ld s0, 208(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll index ee9f96a45d23e..fb84a2528778a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -502,8 +502,8 @@ define fastcc @vector_arg_indirect_stack(i32 %0, i32 %1, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, t4, a0 -; CHECK-NEXT: vl8re32.v v24, (t4) +; CHECK-NEXT: add a0, t5, a0 +; CHECK-NEXT: vl8re32.v v24, (t5) ; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v24 @@ -521,25 +521,31 @@ define fastcc @pass_vector_arg_indirect_stack( @pass_vector_arg_indirect_stack( @pass_vector_arg_indirect_stack( @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, zeroinitializer, zeroinitializer, zeroinitializer, i32 8) ret %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll index 63cd42e97ef6f..9f48fdb3608a0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll @@ -230,7 +230,7 @@ define fastcc <32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (t2) +; CHECK-NEXT: vle32.v v16, (t3) ; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %s = add <32 x i32> %x, %z @@ -261,8 +261,8 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3 ; CHECK-NEXT: li a5, 5 ; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: li a7, 7 -; CHECK-NEXT: mv t2, sp -; CHECK-NEXT: li t3, 8 +; CHECK-NEXT: mv t3, sp +; CHECK-NEXT: li t4, 8 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 @@ -281,7 +281,7 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3 ; CHECK-LABEL: vector_arg_direct_stack: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: addi a1, sp, 8 +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: vadd.vv v8, v8, v16 @@ -303,11 +303,13 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: sd a0, 136(sp) +; CHECK-NEXT: sd a0, 144(sp) ; CHECK-NEXT: li a0, 13 +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: li a0, 12 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: li a2, 2 ; CHECK-NEXT: li a3, 3 @@ -315,11 +317,10 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> ; CHECK-NEXT: li a5, 5 ; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: li a7, 7 -; CHECK-NEXT: li t2, 8 -; CHECK-NEXT: li t3, 9 -; CHECK-NEXT: li t4, 10 -; CHECK-NEXT: li t5, 11 -; CHECK-NEXT: li t6, 12 +; CHECK-NEXT: li t3, 8 +; CHECK-NEXT: li t4, 9 +; CHECK-NEXT: li t5, 10 +; CHECK-NEXT: li t6, 11 ; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: li a0, 0 ; 
CHECK-NEXT: vmv.v.i v16, 0 @@ -336,7 +337,7 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) { ; CHECK-LABEL: vector_mask_arg_direct_stack: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, sp, 136 +; CHECK-NEXT: addi a0, sp, 144 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vmxor.mm v0, v0, v8 From 746cea3eb741cae0cb90542d1580b7bacbf2a615 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Wed, 17 Jul 2024 08:40:42 +0800 Subject: [PATCH 205/777] [VP][RISCV] Introduce vp.splat and RISC-V. (#98731) This patch introduces a vp intrinsic for splat. It's helpful for IR-level passes to create a splat with specific vector length. --- llvm/docs/LangRef.rst | 47 ++ llvm/include/llvm/IR/Intrinsics.td | 7 + llvm/include/llvm/IR/VPIntrinsics.def | 7 + .../SelectionDAG/LegalizeIntegerTypes.cpp | 14 +- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 26 + llvm/lib/IR/IntrinsicInst.cpp | 3 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 30 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 + .../RISCV/rvv/fixed-vectors-vp-splat.ll | 452 +++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vp-splat.ll | 464 ++++++++++++++++++ llvm/unittests/IR/VPIntrinsicTest.cpp | 2 + 12 files changed, 1051 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vp-splat.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 40c8b7f769596..f2ff1f0f5852c 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -22841,6 +22841,53 @@ Examples: llvm.experimental.vp.splice(, , -2, 3, 2); ==> trailing elements +.. _int_experimental_vp_splat: + + +'``llvm.experimental.vp.splat``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <2 x double> @llvm.experimental.vp.splat.v2f64(double %scalar, <2 x i1> %mask, i32 %evl) + declare @llvm.experimental.vp.splat.nxv4i32(i32 %scalar, %mask, i32 %evl) + +Overview: +""""""""" + +The '``llvm.experimental.vp.splat.*``' intrinsic is to create a predicated splat +with specific effective vector length. + +Arguments: +"""""""""" + +The result is a vector and it is a splat of the first scalar argument. The +second argument ``mask`` is a vector mask and has the same number of elements as +the result. The third argument is the explicit vector length of the operation. + +Semantics: +"""""""""" + +This intrinsic splats a vector with ``evl`` elements of a scalar argument. +The lanes in the result vector disabled by ``mask`` are ``poison``. The +elements past ``evl`` are poison. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x float> @llvm.vp.splat.v4f32(float %a, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + %e = insertelement <4 x float> poison, float %a, i32 0 + %s = shufflevector <4 x float> %e, <4 x float> poison, <4 x i32> zeroinitializer + %also.r = select <4 x i1> %mask, <4 x float> %s, <4 x float> poison + + .. 
_int_experimental_vp_reverse: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 9d04256d59317..fc39122aa1be0 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2342,6 +2342,13 @@ def int_experimental_vp_reverse: llvm_i32_ty], [IntrNoMem]>; +def int_experimental_vp_splat: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMVectorElementType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [IntrNoMem]>; + def int_vp_is_fpclass: DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [ llvm_anyvector_ty, diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 8eced073501e8..a4a1000d37259 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -777,6 +777,13 @@ END_REGISTER_VP(experimental_vp_reverse, EXPERIMENTAL_VP_REVERSE) ///// } Shuffles +// llvm.vp.splat(val,mask,vlen) +BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_splat, 1, 2) +BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_SPLAT, -1, experimental_vp_splat, 1, 2) +VP_PROPERTY_NO_FUNCTIONAL +HELPER_MAP_VPID_TO_VPSD(experimental_vp_splat, EXPERIMENTAL_VP_SPLAT) +END_REGISTER_VP(experimental_vp_splat, EXPERIMENTAL_VP_SPLAT) + #undef BEGIN_REGISTER_VP #undef BEGIN_REGISTER_VP_INTRINSIC #undef BEGIN_REGISTER_VP_SDNODE diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8641247cc2236..08321c3842450 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -137,6 +137,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::SPLAT_VECTOR: case ISD::SCALAR_TO_VECTOR: + case ISD::EXPERIMENTAL_VP_SPLAT: Res = PromoteIntRes_ScalarOp(N); break; case ISD::STEP_VECTOR: Res = PromoteIntRes_STEP_VECTOR(N); break; @@ -1920,6 +1921,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { break; case ISD::SPLAT_VECTOR: case ISD::SCALAR_TO_VECTOR: + case ISD::EXPERIMENTAL_VP_SPLAT: Res = PromoteIntOp_ScalarOp(N); break; case ISD::VSELECT: @@ -2215,10 +2217,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, } SDValue DAGTypeLegalizer::PromoteIntOp_ScalarOp(SDNode *N) { + SDValue Op = GetPromotedInteger(N->getOperand(0)); + if (N->getOpcode() == ISD::EXPERIMENTAL_VP_SPLAT) + return SDValue( + DAG.UpdateNodeOperands(N, Op, N->getOperand(1), N->getOperand(2)), 0); + // Integer SPLAT_VECTOR/SCALAR_TO_VECTOR operands are implicitly truncated, // so just promote the operand in place. 
- return SDValue(DAG.UpdateNodeOperands(N, - GetPromotedInteger(N->getOperand(0))), 0); + return SDValue(DAG.UpdateNodeOperands(N, Op), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { @@ -5235,6 +5241,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break; case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; + case ISD::EXPERIMENTAL_VP_SPLAT: case ISD::SPLAT_VECTOR: Res = ExpandIntOp_SPLAT_VECTOR(N); break; case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; @@ -5863,6 +5870,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ScalarOp(SDNode *N) { EVT NOutElemVT = NOutVT.getVectorElementType(); SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutElemVT, N->getOperand(0)); + if (N->isVPOpcode()) + return DAG.getNode(N->getOpcode(), dl, NOutVT, Op, N->getOperand(1), + N->getOperand(2)); return DAG.getNode(N->getOpcode(), dl, NOutVT, Op); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 7af47ed250d91..a5c92ee463690 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -928,6 +928,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, bool SplitSETCC = false); void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -1065,6 +1066,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_VP_SCATTER(SDNode* N, unsigned OpNo); + SDValue WidenVecOp_VP_SPLAT(SDNode *N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); SDValue WidenVecOp_STRICT_FSETCC(SDNode* N); SDValue WidenVecOp_VSELECT(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1a575abbc16f4..ed629485c0c2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1085,6 +1085,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FCOPYSIGN: SplitVecRes_FPOp_MultiType(N, Lo, Hi); break; case ISD::IS_FPCLASS: SplitVecRes_IS_FPCLASS(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; + case ISD::EXPERIMENTAL_VP_SPLAT: SplitVecRes_VP_SPLAT(N, Lo, Hi); break; case ISD::SPLAT_VECTOR: case ISD::SCALAR_TO_VECTOR: SplitVecRes_ScalarOp(N, Lo, Hi); @@ -2007,6 +2008,16 @@ void DAGTypeLegalizer::SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, } } +void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + auto [LoVT, HiVT] = DAG.GetSplitDestVTs(N->getValueType(0)); + auto [MaskLo, MaskHi] = SplitMask(N->getOperand(1)); + auto [EVLLo, EVLHi] = DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl); + Lo = DAG.getNode(N->getOpcode(), dl, LoVT, N->getOperand(0), MaskLo, EVLLo); + Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), 
MaskHi, EVLHi); +} + void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi) { assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!"); @@ -4299,6 +4310,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::STEP_VECTOR: case ISD::SPLAT_VECTOR: case ISD::SCALAR_TO_VECTOR: + case ISD::EXPERIMENTAL_VP_SPLAT: Res = WidenVecRes_ScalarOp(N); break; case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; @@ -5835,6 +5847,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_GATHER(VPGatherSDNode *N) { SDValue DAGTypeLegalizer::WidenVecRes_ScalarOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + if (N->isVPOpcode()) + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, N->getOperand(0), + N->getOperand(1), N->getOperand(2)); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, N->getOperand(0)); } @@ -6374,6 +6389,10 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { Res = WidenVecOp_FP_TO_XINT_SAT(N); break; + case ISD::EXPERIMENTAL_VP_SPLAT: + Res = WidenVecOp_VP_SPLAT(N, OpNo); + break; + case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_ADD: @@ -6834,6 +6853,13 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { report_fatal_error("Unable to widen vector store"); } +SDValue DAGTypeLegalizer::WidenVecOp_VP_SPLAT(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Can widen only mask operand of vp_splat"); + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + N->getOperand(0), GetWidenedVector(N->getOperand(1)), + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) { assert((OpNo == 1 || OpNo == 3) && "Can widen only data or mask operand of vp_store"); diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index e17755c8ad57b..64a14da55b15e 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -699,6 +699,9 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, VPFunc = Intrinsic::getDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; + case Intrinsic::experimental_vp_splat: + VPFunc = Intrinsic::getDeclaration(M, VPID, ReturnType); + break; } assert(VPFunc && "Could not declare VP intrinsic"); return VPFunc; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1280201d7b814..953196a586b6e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -699,7 +699,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, - ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}; + ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF, + ISD::EXPERIMENTAL_VP_SPLAT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -715,7 +716,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::VP_LRINT, ISD::VP_LLRINT, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_REDUCE_FMINIMUM, - ISD::VP_REDUCE_FMAXIMUM}; + ISD::VP_REDUCE_FMAXIMUM, ISD::EXPERIMENTAL_VP_SPLAT}; static const unsigned IntegerVecReduceOps[] = { ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, 
ISD::VECREDUCE_OR, @@ -7252,6 +7253,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerVPSpliceExperimental(Op, DAG); case ISD::EXPERIMENTAL_VP_REVERSE: return lowerVPReverseExperimental(Op, DAG); + case ISD::EXPERIMENTAL_VP_SPLAT: + return lowerVPSplatExperimental(Op, DAG); case ISD::CLEAR_CACHE: { assert(getTargetMachine().getTargetTriple().isOSLinux() && "llvm.clear_cache only needs custom lower on Linux targets"); @@ -11614,6 +11617,29 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, return convertFromScalableVector(VT, Result, DAG, Subtarget); } +SDValue RISCVTargetLowering::lowerVPSplatExperimental(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Val = Op.getOperand(0); + SDValue Mask = Op.getOperand(1); + SDValue VL = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + SDValue Result = + lowerScalarSplat(SDValue(), Val, VL, ContainerVT, DL, DAG, Subtarget); + + if (!VT.isFixedLengthVector()) + return Result; + return convertFromScalableVector(VT, Result, DAG, Subtarget); +} + SDValue RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 2642a188820e1..0b0ad9229f0b3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -973,6 +973,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPSplatExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll new file mode 100644 index 0000000000000..2913cbdf0fffd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll @@ -0,0 +1,452 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define <1 x i8> @vp_splat_v1i8(i8 %val, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <1 x i8> @llvm.experimental.vp.splat.v1i8(i8 %val, <1 x i1> %m, i32 %evl) + ret <1 x i8> %splat +} + +define <2 x i8> @vp_splat_v2i8(i8 %val, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 %val, <2 x i1> %m, i32 %evl) + ret <2 x i8> %splat +} + +define <4 x i8> @vp_splat_v4i8(i8 %val, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vp_splat_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 %val, <4 x i1> %m, i32 %evl) + ret <4 x i8> %splat +} + +define <8 x i8> @vp_splat_v8i8(i8 %val, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 %val, <8 x i1> %m, i32 %evl) + ret <8 x i8> %splat +} + +define <16 x i8> @vp_splat_v16i8(i8 %val, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 %val, <16 x i1> %m, i32 %evl) + ret <16 x i8> %splat +} + +define <32 x i8> @vp_splat_v32i8(i8 %val, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <32 x i8> @llvm.experimental.vp.splat.v32i8(i8 %val, <32 x i1> %m, i32 %evl) + ret <32 x i8> %splat +} + +define <64 x i8> @vp_splat_v64i8(i8 %val, <64 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <64 x i8> @llvm.experimental.vp.splat.v64i8(i8 %val, <64 x i1> %m, i32 %evl) + ret <64 x i8> %splat +} + +define <1 x i16> @vp_splat_v1i16(i16 %val, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <1 x i16> @llvm.experimental.vp.splat.v1i16(i16 %val, <1 x i1> %m, i32 %evl) + ret <1 x i16> %splat +} + +define <2 x i16> @vp_splat_v2i16(i16 %val, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 %val, <2 x i1> %m, i32 %evl) + ret <2 x i16> %splat +} + +define <4 x i16> @vp_splat_v4i16(i16 %val, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 %val, <4 x i1> %m, i32 %evl) + ret <4 x i16> %splat +} + +define <8 x i16> @vp_splat_v8i16(i16 %val, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 %val, <8 x i1> %m, i32 %evl) + ret <8 x i16> %splat +} + +define <16 x i16> @vp_splat_v16i16(i16 %val, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 %val, <16 x i1> %m, i32 %evl) + ret <16 x i16> %splat +} + +define <32 x i16> @vp_splat_v32i16(i16 %val, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 
+; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <32 x i16> @llvm.experimental.vp.splat.v32i16(i16 %val, <32 x i1> %m, i32 %evl) + ret <32 x i16> %splat +} + +define <1 x i32> @vp_splat_v1i32(i32 %val, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <1 x i32> @llvm.experimental.vp.splat.v1i32(i32 %val, <1 x i1> %m, i32 %evl) + ret <1 x i32> %splat +} + +define <2 x i32> @vp_splat_v2i32(i32 %val, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 %val, <2 x i1> %m, i32 %evl) + ret <2 x i32> %splat +} + +define <4 x i32> @vp_splat_v4i32(i32 %val, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 %val, <4 x i1> %m, i32 %evl) + ret <4 x i32> %splat +} + +define <8 x i32> @vp_splat_v8i32(i32 %val, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 %val, <8 x i1> %m, i32 %evl) + ret <8 x i32> %splat +} + +define <16 x i32> @vp_splat_v16i32(i32 %val, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 %val, <16 x i1> %m, i32 %evl) + ret <16 x i32> %splat +} + +define <1 x i64> @vp_splat_v1i64(i64 %val, <1 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_v1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_v1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call <1 x i64> @llvm.experimental.vp.splat.v1i64(i64 %val, <1 x i1> %m, i32 %evl) + ret <1 x i64> %splat +} + +define <2 x i64> @vp_splat_v2i64(i64 %val, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 %val, <2 x i1> %m, i32 %evl) + ret <2 x i64> %splat +} + +define <4 x i64> @vp_splat_v4i64(i64 %val, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 
8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call <4 x i64> @llvm.experimental.vp.splat.v4i64(i64 %val, <4 x i1> %m, i32 %evl) + ret <4 x i64> %splat +} + +define <8 x i64> @vp_splat_v8i64(i64 %val, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 %val, <8 x i1> %m, i32 %evl) + ret <8 x i64> %splat +} + +define <1 x half> @vp_splat_v1f16(half %val, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <1 x half> @llvm.experimental.vp.splat.v1f16(half %val, <1 x i1> %m, i32 %evl) + ret <1 x half> %splat +} + +define <2 x half> @vp_splat_v2f16(half %val, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <2 x half> @llvm.experimental.vp.splat.v2f16(half %val, <2 x i1> %m, i32 %evl) + ret <2 x half> %splat +} + +define <4 x half> @vp_splat_v4f16(half %val, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <4 x half> @llvm.experimental.vp.splat.v4f16(half %val, <4 x i1> %m, i32 %evl) + ret <4 x half> %splat +} + +define <8 x half> @vp_splat_v8f16(half %val, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <8 x half> @llvm.experimental.vp.splat.v8f16(half %val, <8 x i1> %m, i32 %evl) + ret <8 x half> %splat +} + +define <16 x half> @vp_splat_v16f16(half %val, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <16 x half> @llvm.experimental.vp.splat.v16f16(half %val, <16 x i1> %m, i32 %evl) + ret <16 x half> %splat +} + +define <32 x half> @vp_splat_v32f16(half %val, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <32 x half> @llvm.experimental.vp.splat.v32f16(half %val, <32 x i1> %m, i32 %evl) + ret <32 x half> %splat +} + +define <1 x float> @vp_splat_v1f32(float %val, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <1 x float> @llvm.experimental.vp.splat.v1f32(float %val, <1 x i1> %m, i32 
%evl) + ret <1 x float> %splat +} + +define <2 x float> @vp_splat_v2f32(float %val, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <2 x float> @llvm.experimental.vp.splat.v2f32(float %val, <2 x i1> %m, i32 %evl) + ret <2 x float> %splat +} + +define <4 x float> @vp_splat_v4f32(float %val, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <4 x float> @llvm.experimental.vp.splat.v4f32(float %val, <4 x i1> %m, i32 %evl) + ret <4 x float> %splat +} + +define <8 x float> @vp_splat_v8f32(float %val, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <8 x float> @llvm.experimental.vp.splat.v8f32(float %val, <8 x i1> %m, i32 %evl) + ret <8 x float> %splat +} + +define <16 x float> @vp_splat_v16f32(float %val, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <16 x float> @llvm.experimental.vp.splat.v16f32(float %val, <16 x i1> %m, i32 %evl) + ret <16 x float> %splat +} + +define <1 x double> @vp_splat_v1f64(double %val, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <1 x double> @llvm.experimental.vp.splat.v1f64(double %val, <1 x i1> %m, i32 %evl) + ret <1 x double> %splat +} + +define <2 x double> @vp_splat_v2f64(double %val, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <2 x double> @llvm.experimental.vp.splat.v2f64(double %val, <2 x i1> %m, i32 %evl) + ret <2 x double> %splat +} + +define <4 x double> @vp_splat_v4f64(double %val, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <4 x double> @llvm.experimental.vp.splat.v4f64(double %val, <4 x i1> %m, i32 %evl) + ret <4 x double> %splat +} + +define <8 x double> @vp_splat_v8f64(double %val, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call <8 x double> @llvm.experimental.vp.splat.v8f64(double %val, <8 x i1> %m, i32 %evl) + ret <8 x double> %splat +} + +define <16 x i31> @vp_splat_v16i31(i31 %val, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v16i31: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <16 x i31> @llvm.experimental.vp.splat.v16i31(i31 %val, <16 x i1> %m, i32 %evl) + ret <16 x i31> %splat +} + +define <15 x i32> @vp_splat_v15i32(i32 %val, <15 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v15i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <15 x i32> 
@llvm.experimental.vp.splat.v15i32(i32 %val, <15 x i1> %m, i32 %evl) + ret <15 x i32> %splat +} + +; Split case. +define <32 x i32> @vp_splat_v32i32(i32 %val, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call <32 x i32> @llvm.experimental.vp.splat.v32i32(i32 %val, <32 x i1> %m, i32 %evl) + ret <32 x i32> %splat +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll new file mode 100644 index 0000000000000..5fbdefda9f402 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll @@ -0,0 +1,464 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define @vp_splat_nxv1i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv1i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv16i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv16i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv32i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv32i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv64i8(i8 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv64i8(i8 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv1i16(i16 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv1i16(i16 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2i16(i16 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv2i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2i16(i16 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4i16(i16 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4i16(i16 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8i16(i16 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8i16(i16 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv16i16(i16 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv16i16(i16 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv32i16(i16 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv32i16(i16 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv1i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv1i32(i32 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2i32(i32 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4i32(i32 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8i32(i32 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv16i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv16i32(i32 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv1i64(i64 %val, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call 
@llvm.experimental.vp.splat.nxv1i64(i64 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2i64(i64 %val, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2i64(i64 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4i64(i64 %val, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4i64(i64 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8i64(i64 %val, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_splat_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_splat_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8i64(i64 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv1f16(half %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv1f16(half %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2f16(half %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2f16(half %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4f16(half %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4f16(half %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8f16(half %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8f16(half %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv16f16(half %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv16f16(half %val, %m, i32 %evl) + ret %splat +} + 
+define @vp_splat_nxv32f16(half %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv32f16(half %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv1f32(float %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv1f32(float %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2f32(float %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2f32(float %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4f32(float %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4f32(float %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8f32(float %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8f32(float %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv16f32(float %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv16f32(float %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv1f64(double %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv1f64(double %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv2f64(double %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv2f64(double %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv4f64(double %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv4f64(double %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv8f64(double %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv8f64(double %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv16i31(i31 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv16i31: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv16i31(i31 %val, %m, i32 %evl) + ret %splat +} + +define @vp_splat_nxv15i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv15i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv15i32(i32 %val, %m, i32 %evl) + ret %splat +} + +; Split case. +define @vp_splat_nxv32i32(i32 %val, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_splat_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: sub a3, a1, a2 +; CHECK-NEXT: sltu a4, a1, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: bltu a1, a2, .LBB39_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB39_2: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %splat = call @llvm.experimental.vp.splat.nxv32i32(i32 %val, %m, i32 %evl) + ret %splat +} diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index d6508abd5197e..eab2850ca4e1e 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -108,6 +108,8 @@ class VPIntrinsicTest : public testing::Test { "addrspace(1)*, i32, <8 x i1>, i32) "; Str << " declare <8 x i32> @llvm.vp.gather.v8i32.v8p0i32(<8 x i32*>, <8 x " "i1>, i32) "; + Str << " declare <8 x i32> @llvm.experimental.vp.splat.v8i32(i32, <8 x " + "i1>, i32) "; for (const char *ReductionOpcode : ReductionIntOpcodes) Str << " declare i32 @llvm.vp.reduce." << ReductionOpcode From cf6233f408029835488e1b4f2aad8cfa69e57c22 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 16 Jul 2024 17:56:08 -0700 Subject: [PATCH 206/777] [lld-link] Change /lldemit:llvm to use the pre-codegen module This matches ELF (#97480). clang cc1 -emit-llvm and -emit-llvm-bc for ThinLTO backend compilation also uses `PreCodeGenModuleHook`. While here, replace deprecated %T with %t. 
Pull Request: https://github.com/llvm/llvm-project/pull/98589 --- lld/COFF/LTO.cpp | 2 +- lld/test/COFF/lto-emit-llvm.ll | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp index be49aa6e8bb3d..5c881bc01c663 100644 --- a/lld/COFF/LTO.cpp +++ b/lld/COFF/LTO.cpp @@ -91,7 +91,7 @@ lto::Config BitcodeCompiler::createConfig() { c.TimeTraceGranularity = ctx.config.timeTraceGranularity; if (ctx.config.emit == EmitKind::LLVM) { - c.PostInternalizeModuleHook = [this](size_t task, const Module &m) { + c.PreCodeGenModuleHook = [this](size_t task, const Module &m) { if (std::unique_ptr os = openLTOOutputFile(ctx.config.outputFile)) WriteBitcodeToFile(m, *os, false); diff --git a/lld/test/COFF/lto-emit-llvm.ll b/lld/test/COFF/lto-emit-llvm.ll index 985058de10a48..3ba6cf722e97f 100644 --- a/lld/test/COFF/lto-emit-llvm.ll +++ b/lld/test/COFF/lto-emit-llvm.ll @@ -1,10 +1,11 @@ ; REQUIRES: x86 -; RUN: llvm-as -o %T/lto.obj %s +; RUN: rm -rf %t && mkdir %t +; RUN: llvm-as -o %t/lto.obj %s -; RUN: lld-link /lldemit:llvm /out:%T/lto.bc /entry:main /subsystem:console %T/lto.obj -; RUN: llvm-dis %T/lto.bc -o - | FileCheck %s +; RUN: lld-link /lldemit:llvm /out:%t/lto.bc /entry:main /subsystem:console %t/lto.obj +; RUN: llvm-dis %t/lto.bc -o - | FileCheck %s -; CHECK: define void @main() +; CHECK: define void @main() local_unnamed_addr target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc" From b042af363bcdfa5e7e3d7dd424a561a041ac8f02 Mon Sep 17 00:00:00 2001 From: Allen Date: Wed, 17 Jul 2024 09:01:42 +0800 Subject: [PATCH 207/777] [clang codegen] Precommit tests for PR96025, NFC (#98704) Add extern "C" for the function because there is difference function naming rules between Linux and Windows --- clang/test/CodeGen/math-libcalls-tbaa.cpp | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 clang/test/CodeGen/math-libcalls-tbaa.cpp diff --git a/clang/test/CodeGen/math-libcalls-tbaa.cpp b/clang/test/CodeGen/math-libcalls-tbaa.cpp new file mode 100644 index 0000000000000..5b93079492bc5 --- /dev/null +++ b/clang/test/CodeGen/math-libcalls-tbaa.cpp @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// RUN: %clang_cc1 -fmath-errno -O3 -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NoNewStructPathTBAA +// RUN: %clang_cc1 -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NewStructPathTBAA + +extern "C" float expf(float); + +// Emit int TBAA metadata on FP math libcalls, which is useful for alias analysis + +// CHECK-LABEL: define dso_local float @foo( +// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]], float noundef [[R2INV:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CALL]], [[TMP1]] +// CHECK-NEXT: ret float [[MUL]] +// +extern "C" float foo (float num[], float r2inv, int n) { + const float expm2 = expf(num[10]); // Emit TBAA metadata on @expf + float tmp = expm2 
* num[10]; + return tmp; +} +//. +// NoNewStructPathTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// NoNewStructPathTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0} +// NoNewStructPathTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// NoNewStructPathTBAA: [[META5]] = !{!"Simple C++ TBAA"} +//. +// NewStructPathTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4} +// NewStructPathTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"float"} +// NewStructPathTBAA: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"} +// NewStructPathTBAA: [[META5]] = !{!"Simple C++ TBAA"} +//. +//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +// NewStructPathTBAA: {{.*}} +// NoNewStructPathTBAA: {{.*}} From befd44bcdc6e9d2f4099bf344826b2cd0fd8cbdc Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 17 Jul 2024 11:30:44 +0900 Subject: [PATCH 208/777] [AMDGPU] Update hasUnwantedEffectsWhenEXECEmpty (#97982) Add barriers and s_wait_event to hasUnwantedEffectsWhenEXECEmpty. Add a comment documenting the current expected use of the function. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 9 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 18 +- .../CodeGen/AMDGPU/insert-skips-gfx10.mir | 30 ++ .../CodeGen/AMDGPU/insert-skips-gfx12.mir | 308 ++++++++++++++++++ 4 files changed, 361 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ba72152f5668e..e6e74d619003d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4131,14 +4131,17 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // EXEC = 0, but checking for that case here seems not worth it // given the typical code patterns. if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || - isEXP(Opcode) || - Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || - Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) + isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT || + Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT) return true; if (MI.isCall() || MI.isInlineAsm()) return true; // conservative assumption + // Assume that barrier interactions are only intended with active lanes. + if (isBarrier(Opcode)) + return true; + // A mode change is a scalar operation that influences vector instructions. 
if (modifiesModeRegister(MI)) return true; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index b723deb9543cd..1712dfe8d406c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -936,6 +936,16 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM; } + bool isBarrier(unsigned Opcode) const { + return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_INIT_M0 || + Opcode == AMDGPU::S_BARRIER_INIT_IMM || + Opcode == AMDGPU::S_BARRIER_JOIN_IMM || + Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::DS_GWS_INIT || + Opcode == AMDGPU::DS_GWS_BARRIER; + } + static bool doesNotReadTiedSource(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead; } @@ -1009,7 +1019,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Return true if the instruction modifies the mode register.q static bool modifiesModeRegister(const MachineInstr &MI); - /// Whether we must prevent this instruction from executing with EXEC = 0. + /// This function is used to determine if an instruction can be safely + /// executed under EXEC = 0 without hardware error, indeterminate results, + /// and/or visible effects on future vector execution or outside the shader. + /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is + /// used in removing branches over short EXEC = 0 sequences. + /// As such it embeds certain assumptions which may not apply to every case + /// of EXEC = 0 execution. bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const; /// Returns true if the instruction could potentially depend on the value of diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir index 1d3132dbe2af2..b4ed3cafbacb5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir @@ -184,3 +184,33 @@ body: | bb.2: S_ENDPGM 0 ... + +--- +name: skip_barrier +body: | + ; CHECK-LABEL: name: skip_barrier + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER + + bb.2: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir index c0b839d218a95..2d092974ac566 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir @@ -300,3 +300,311 @@ body: | bb.2: S_ENDPGM 0 ... 
+ +--- +name: skip_wait_event +body: | + ; CHECK-LABEL: name: skip_wait_event + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_WAIT_EVENT 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_WAIT_EVENT 0 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_signal_imm +body: | + ; CHECK-LABEL: name: skip_barrier_signal_imm + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_SIGNAL_IMM -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_SIGNAL_IMM -1 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_signal_isfirst_imm +body: | + ; CHECK-LABEL: name: skip_barrier_signal_isfirst_imm + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_signal_m0 +body: | + ; CHECK-LABEL: name: skip_barrier_signal_m0 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_BARRIER_SIGNAL_M0 implicit $m0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + $m0 = S_MOV_B32 -1 + S_BARRIER_SIGNAL_M0 implicit $m0 + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: skip_barrier_signal_isfirst_m0 +body: | + ; CHECK-LABEL: name: skip_barrier_signal_isfirst_m0 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + $m0 = S_MOV_B32 -1 + S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_wait +body: | + ; CHECK-LABEL: name: skip_barrier_wait + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_WAIT -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_WAIT -1 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_init_imm +body: | + ; CHECK-LABEL: name: skip_barrier_init_imm + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_BARRIER_INIT_IMM -1, implicit $m0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + $m0 = S_MOV_B32 -1 + S_BARRIER_INIT_IMM -1, implicit $m0 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_init_m0 +body: | + ; CHECK-LABEL: name: skip_barrier_init_m0 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_BARRIER_INIT_M0 implicit $m0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + $m0 = S_MOV_B32 -1 + S_BARRIER_INIT_M0 implicit $m0 + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: skip_barrier_join_imm +body: | + ; CHECK-LABEL: name: skip_barrier_join_imm + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_JOIN_IMM -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_JOIN_IMM -1 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_leave +body: | + ; CHECK-LABEL: name: skip_barrier_leave + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_LEAVE implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_LEAVE implicit-def $scc + + bb.2: + S_ENDPGM 0 +... From 1b873e565eea97d02cdb2375c50ceea89a818e5b Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 17 Jul 2024 11:26:56 +0800 Subject: [PATCH 209/777] [CodeGen][NewPM] Port `phi-node-elimination` to new pass manager (#98867) - Add `PHIEliminationPass `. - Support new pass manager in `MachineBasicBlock:: SplitCriticalEdge ` --- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 19 ++- llvm/include/llvm/CodeGen/PHIElimination.h | 24 ++++ llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/MachineBasicBlock.cpp | 29 +++-- llvm/lib/CodeGen/PHIElimination.cpp | 118 ++++++++++++------ llvm/lib/Passes/PassBuilder.cpp | 1 + .../CodeGen/AArch64/PHIElimination-crash.mir | 3 + .../AArch64/PHIElimination-debugloc.mir | 4 + .../AMDGPU/phi-elimination-assertion.mir | 1 + .../CodeGen/AMDGPU/phi-elimination-end-cf.mir | 1 + .../CodeGen/AMDGPU/split-mbb-lis-subrange.mir | 1 + .../AMDGPU/stale-livevar-in-twoaddr-pass.mir | 1 + .../CodeGen/PowerPC/2013-07-01-PHIElimBug.mir | 1 + llvm/test/CodeGen/PowerPC/livevars-crash1.mir | 3 + llvm/test/CodeGen/PowerPC/livevars-crash2.mir | 3 + llvm/test/CodeGen/PowerPC/phi-eliminate.mir | 2 + .../CodeGen/PowerPC/two-address-crash.mir | 1 + llvm/test/CodeGen/Thumb2/phi_prevent_copy.mir | 1 + llvm/test/CodeGen/X86/callbr-asm-kill.mir | 1 + llvm/test/CodeGen/X86/phielim-undef.mir | 1 + 21 files changed, 168 insertions(+), 50 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/PHIElimination.h diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 562d37ef32f54..b8153fd5d3fb7 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -43,6 +43,8 @@ class raw_ostream; class LiveIntervals; class TargetRegisterClass; class TargetRegisterInfo; +template class AnalysisManager; +using MachineFunctionAnalysisManager = AnalysisManager; // This structure uniquely identifies a basic block section. // Possible values are @@ -968,7 +970,16 @@ class MachineBasicBlock /// MachineLoopInfo, as applicable. 
MachineBasicBlock * SplitCriticalEdge(MachineBasicBlock *Succ, Pass &P, - std::vector> *LiveInSets = nullptr); + std::vector> *LiveInSets = nullptr) { + return SplitCriticalEdge(Succ, &P, nullptr, LiveInSets); + } + + MachineBasicBlock * + SplitCriticalEdge(MachineBasicBlock *Succ, + MachineFunctionAnalysisManager &MFAM, + std::vector> *LiveInSets = nullptr) { + return SplitCriticalEdge(Succ, nullptr, &MFAM, LiveInSets); + } /// Check if the edge between this block and the given successor \p /// Succ, can be split. If this returns true a subsequent call to @@ -1243,6 +1254,12 @@ class MachineBasicBlock /// unless you know what you're doing, because it doesn't update Pred's /// successors list. Use Pred->removeSuccessor instead. void removePredecessor(MachineBasicBlock *Pred); + + // Helper method for new pass manager migration. + MachineBasicBlock * + SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P, + MachineFunctionAnalysisManager *MFAM, + std::vector> *LiveInSets); }; raw_ostream& operator<<(raw_ostream &OS, const MachineBasicBlock &MBB); diff --git a/llvm/include/llvm/CodeGen/PHIElimination.h b/llvm/include/llvm/CodeGen/PHIElimination.h new file mode 100644 index 0000000000000..3a1a4c5c6133f --- /dev/null +++ b/llvm/include/llvm/CodeGen/PHIElimination.h @@ -0,0 +1,24 @@ +//===- llvm/CodeGen/PHIElimination.h ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_PHIELIMINATION_H +#define LLVM_CODEGEN_PHIELIMINATION_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class PHIEliminationPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_PHIELIMINATION_H diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 5b8e69b602e2b..fb7a3c107d88a 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -43,6 +43,7 @@ #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/PHIElimination.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/ReplaceWithVeclib.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index a47d7494f2eef..03f0782de6fed 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -132,6 +132,7 @@ MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass()) MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass()) MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass()) +MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) MACHINE_FUNCTION_PASS("print", PrintMIRPass()) MACHINE_FUNCTION_PASS("print", LiveIntervalsPrinterPass(dbgs())) MACHINE_FUNCTION_PASS("print", LiveVariablesPrinterPass(dbgs())) @@ -231,7 +232,6 @@ DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) 
DUMMY_MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass) DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass) DUMMY_MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass) -DUMMY_MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass) DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass) DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass) DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass) diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 90d2edebedd72..d681d00b5d8c4 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -1135,9 +1135,19 @@ class SlotIndexUpdateDelegate : public MachineFunction::Delegate { } }; +#define GET_RESULT(RESULT, GETTER, INFIX) \ + [MF, P, MFAM]() { \ + if (P) { \ + auto *Wrapper = P->getAnalysisIfAvailable(); \ + return Wrapper ? &Wrapper->GETTER() : nullptr; \ + } \ + return MFAM->getCachedResult(*MF); \ + }() + MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( - MachineBasicBlock *Succ, Pass &P, + MachineBasicBlock *Succ, Pass *P, MachineFunctionAnalysisManager *MFAM, std::vector> *LiveInSets) { + assert((P || MFAM) && "Need a way to get analysis results!"); if (!canSplitCriticalEdge(Succ)) return nullptr; @@ -1161,10 +1171,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( << " -- " << printMBBReference(*NMBB) << " -- " << printMBBReference(*Succ) << '\n'); - auto *LISWrapper = P.getAnalysisIfAvailable(); - LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; - auto *SIWrapper = P.getAnalysisIfAvailable(); - SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr; + LiveIntervals *LIS = GET_RESULT(LiveIntervals, getLIS, ); + SlotIndexes *Indexes = GET_RESULT(SlotIndexes, getSI, ); if (LIS) LIS->insertMBBInMaps(NMBB); else if (Indexes) @@ -1173,8 +1181,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( // On some targets like Mips, branches may kill virtual registers. Make sure // that LiveVariables is properly updated after updateTerminator replaces the // terminators. - auto *LVWrapper = P.getAnalysisIfAvailable(); - LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr; + LiveVariables *LV = GET_RESULT(LiveVariables, getLV, ); // Collect a list of virtual registers killed by the terminators. SmallVector KilledRegs; @@ -1339,12 +1346,10 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs); } - if (auto *MDTWrapper = - P.getAnalysisIfAvailable()) - MDTWrapper->getDomTree().recordSplitCriticalEdge(this, Succ, NMBB); + if (auto *MDT = GET_RESULT(MachineDominatorTree, getDomTree, )) + MDT->recordSplitCriticalEdge(this, Succ, NMBB); - auto *MLIWrapper = P.getAnalysisIfAvailable(); - if (MachineLoopInfo *MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr) + if (MachineLoopInfo *MLI = GET_RESULT(MachineLoop, getLI, Info)) if (MachineLoop *TIL = MLI->getLoopFor(this)) { // If one or the other blocks were not in a loop, the new block is not // either, and thus LI doesn't need to be updated. 
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index e392bb8087327..e5f40771eda86 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/PHIElimination.h" #include "PHIEliminationUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -64,22 +65,13 @@ static cl::opt NoPhiElimLiveOutEarlyExit( namespace { -class PHIElimination : public MachineFunctionPass { +class PHIEliminationImpl { MachineRegisterInfo *MRI = nullptr; // Machine register information LiveVariables *LV = nullptr; LiveIntervals *LIS = nullptr; + MachineLoopInfo *MLI = nullptr; + MachineDominatorTree *MDT = nullptr; -public: - static char ID; // Pass identification, replacement for typeid - - PHIElimination() : MachineFunctionPass(ID) { - initializePHIEliminationPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions /// in predecessor basic blocks. bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB); @@ -118,10 +110,71 @@ class PHIElimination : public MachineFunctionPass { using LoweredPHIMap = DenseMap; LoweredPHIMap LoweredPHIs; + + MachineFunctionPass *P = nullptr; + MachineFunctionAnalysisManager *MFAM = nullptr; + +public: + PHIEliminationImpl(MachineFunctionPass *P) : P(P) { + auto *LVWrapper = P->getAnalysisIfAvailable(); + auto *LISWrapper = P->getAnalysisIfAvailable(); + auto *MLIWrapper = P->getAnalysisIfAvailable(); + auto *MDTWrapper = + P->getAnalysisIfAvailable(); + LV = LVWrapper ? &LVWrapper->getLV() : nullptr; + LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; + MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr; + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; + } + + PHIEliminationImpl(MachineFunction &MF, MachineFunctionAnalysisManager &AM) + : LV(AM.getCachedResult(MF)), + LIS(AM.getCachedResult(MF)), + MLI(AM.getCachedResult(MF)), + MDT(AM.getCachedResult(MF)), MFAM(&AM) {} + + bool run(MachineFunction &MF); +}; + +class PHIElimination : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + PHIElimination() : MachineFunctionPass(ID) { + initializePHIEliminationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + PHIEliminationImpl Impl(this); + return Impl.run(MF); + } + + MachineFunctionProperties getSetProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; }; } // end anonymous namespace +PreservedAnalyses +PHIEliminationPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + PHIEliminationImpl Impl(MF, MFAM); + bool Changed = Impl.run(MF); + if (!Changed) + return PreservedAnalyses::all(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; +} + STATISTIC(NumLowered, "Number of phis lowered"); STATISTIC(NumCriticalEdgesSplit, "Number of critical edges split"); STATISTIC(NumReused, "Number of reused lowered phis"); @@ -147,12 +200,8 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { +bool PHIEliminationImpl::run(MachineFunction &MF) { MRI = &MF.getRegInfo(); - auto *LVWrapper = getAnalysisIfAvailable(); - LV = LVWrapper ? &LVWrapper->getLV() : nullptr; - auto *LISWrapper = getAnalysisIfAvailable(); - LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; bool Changed = false; @@ -187,9 +236,6 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { } } - MachineLoopInfoWrapperPass *MLIWrapper = - getAnalysisIfAvailable(); - MachineLoopInfo *MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr; for (auto &MBB : MF) Changed |= SplitPHIEdges(MF, MBB, MLI, (LV ? &LiveInSets : nullptr)); } @@ -223,9 +269,8 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { } // TODO: we should use the incremental DomTree updater here. - if (Changed) - if (auto *MDT = getAnalysisIfAvailable()) - MDT->getDomTree().getBase().recalculate(MF); + if (Changed && MDT) + MDT->getBase().recalculate(MF); LoweredPHIs.clear(); ImpDefs.clear(); @@ -238,8 +283,8 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions in /// predecessor basic blocks. -bool PHIElimination::EliminatePHINodes(MachineFunction &MF, - MachineBasicBlock &MBB) { +bool PHIEliminationImpl::EliminatePHINodes(MachineFunction &MF, + MachineBasicBlock &MBB) { if (MBB.empty() || !MBB.front().isPHI()) return false; // Quick exit for basic blocks without PHIs. @@ -286,9 +331,9 @@ static bool allPhiOperandsUndefined(const MachineInstr &MPhi, return true; } /// LowerPHINode - Lower the PHI node at the top of the specified block. 
-void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, - MachineBasicBlock::iterator LastPHIIt, - bool AllEdgesCritical) { +void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator LastPHIIt, + bool AllEdgesCritical) { ++NumLowered; MachineBasicBlock::iterator AfterPHIsIt = std::next(LastPHIIt); @@ -689,7 +734,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, /// particular, we want to map the number of uses of a virtual register which is /// used in a PHI node. We map that to the BB the vreg is coming from. This is /// used later to determine when the vreg is killed in the BB. -void PHIElimination::analyzePHINodes(const MachineFunction &MF) { +void PHIEliminationImpl::analyzePHINodes(const MachineFunction &MF) { for (const auto &MBB : MF) { for (const auto &BBI : MBB) { if (!BBI.isPHI()) @@ -705,9 +750,9 @@ void PHIElimination::analyzePHINodes(const MachineFunction &MF) { } } -bool PHIElimination::SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB, - MachineLoopInfo *MLI, - std::vector> *LiveInSets) { +bool PHIEliminationImpl::SplitPHIEdges( + MachineFunction &MF, MachineBasicBlock &MBB, MachineLoopInfo *MLI, + std::vector> *LiveInSets) { if (MBB.empty() || !MBB.front().isPHI() || MBB.isEHPad()) return false; // Quick exit for basic blocks without PHIs. @@ -774,7 +819,8 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB, } if (!ShouldSplit && !SplitAllCriticalEdges) continue; - if (!PreMBB->SplitCriticalEdge(&MBB, *this, LiveInSets)) { + if (!(P ? PreMBB->SplitCriticalEdge(&MBB, *P, LiveInSets) + : PreMBB->SplitCriticalEdge(&MBB, *MFAM, LiveInSets))) { LLVM_DEBUG(dbgs() << "Failed to split critical edge.\n"); continue; } @@ -785,7 +831,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB, return Changed; } -bool PHIElimination::isLiveIn(Register Reg, const MachineBasicBlock *MBB) { +bool PHIEliminationImpl::isLiveIn(Register Reg, const MachineBasicBlock *MBB) { assert((LV || LIS) && "isLiveIn() requires either LiveVariables or LiveIntervals"); if (LIS) @@ -794,8 +840,8 @@ bool PHIElimination::isLiveIn(Register Reg, const MachineBasicBlock *MBB) { return LV->isLiveIn(Reg, *MBB); } -bool PHIElimination::isLiveOutPastPHIs(Register Reg, - const MachineBasicBlock *MBB) { +bool PHIEliminationImpl::isLiveOutPastPHIs(Register Reg, + const MachineBasicBlock *MBB) { assert((LV || LIS) && "isLiveOutPastPHIs() requires either LiveVariables or LiveIntervals"); // LiveVariables considers uses in PHIs to be in the predecessor basic block, diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 929690c2c74d6..a9d3f8ec3a4ec 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -105,6 +105,7 @@ #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineVerifier.h" +#include "llvm/CodeGen/PHIElimination.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/SafeStack.h" diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir b/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir index 1a1ba154062b7..8f43686429268 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir @@ -1,6 +1,9 @@ # RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o /dev/null %s \ # RUN: -run-pass=livevars,phi-node-elimination,twoaddressinstruction \ # 
RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 +# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o /dev/null %s \ +# RUN: --passes='require,phi-node-elimination,two-address-instruction' \ +# RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 # Used to result in # diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir index 61101491e9d9f..9b8283352161a 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir @@ -2,6 +2,10 @@ # RUN: -run-pass=livevars,phi-node-elimination,twoaddressinstruction \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 \ # RUN: | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s \ +# RUN: --passes='require,phi-node-elimination,two-address-instruction' \ +# RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 \ +# RUN: | FileCheck %s --- | define void @test() !dbg !7 { diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir index e2e6ea76103c7..00828820f8ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple amdgcn -run-pass livevars -run-pass phi-node-elimination -o - %s | FileCheck %s +# RUN: llc -mtriple amdgcn --passes='require,phi-node-elimination' -o - %s | FileCheck %s ################################################################################ # This test used to hit an assert in PHIElimination: diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 83c30507ce3ce..8b009978055ac 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple amdgcn -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple amdgcn --passes='require,phi-node-elimination' -o - %s | FileCheck %s # CHECK-LABEL: phi-cf-test # CHECK: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir index 896986ff9b02b..dfeca8db0b464 100644 --- a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass liveintervals,phi-node-elimination -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --passes='require,phi-node-elimination' -o - %s | FileCheck -check-prefixes=GCN %s # This checks liveintervals pass verification and phi-node-elimination correctly preserves them. 
diff --git a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir index 08bdec8871e17..4bb0046c0ee01 100644 --- a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir +++ b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=livevars,phi-node-elimination,twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --passes='require,phi-node-elimination,two-address-instruction' -verify-machineinstrs -o - %s | FileCheck %s # This used to fail under ASAN enabled build because we didn't update LiveVariables in SIInstrInfo::convertToThreeAddress() # CHECK: _amdgpu_ps_main diff --git a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir index 9e7c63a76ceda..2a669ed6b03a1 100644 --- a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir +++ b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple powerpc64-unknown-linux-gnu -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple powerpc64-unknown-linux-gnu --passes='require,phi-node-elimination' -o - %s | FileCheck %s # This test case was originally known as # test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir index 6ddc2b022e9b5..68d2e5a627e9d 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir @@ -1,6 +1,9 @@ # RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ # RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ # RUN: FileCheck %s +# RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ +# RUN: --passes='require,phi-node-elimination' | \ +# RUN: FileCheck %s --- | ; Function Attrs: noreturn nounwind diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir index 1ae24fd0b7015..e165c85d5b72a 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir @@ -1,6 +1,9 @@ # RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ # RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ # RUN: FileCheck %s +# RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ +# RUN: --passes='require,phi-node-elimination' | \ +# RUN: FileCheck %s --- | define float @testfloatslt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) { diff --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir index f50d92772e345..72f778286abe4 100644 --- a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir +++ b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir @@ -1,5 +1,7 @@ # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - \ # RUN: -run-pass=livevars,phi-node-elimination | FileCheck %s +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - \ +# RUN: --passes='require,phi-node-elimination' | FileCheck %s --- | define void @phi_eliminate(i32 %0, i32 %1, ptr %2) { diff --git a/llvm/test/CodeGen/PowerPC/two-address-crash.mir b/llvm/test/CodeGen/PowerPC/two-address-crash.mir index eda0a93e37f9d..cd2e69d8612b9 100644 --- a/llvm/test/CodeGen/PowerPC/two-address-crash.mir +++ b/llvm/test/CodeGen/PowerPC/two-address-crash.mir @@ -1,5 +1,6 @@ # RUN: llc 
-mtriple=ppc32-- %s -run-pass=phi-node-elimination \ # RUN: -verify-machineinstrs -o /dev/null 2>&1 +# RUN: llc -mtriple=ppc32-- %s --passes=phi-node-elimination -o /dev/null 2>&1 # RUN: llc -mtriple=ppc32-- %s -start-before=phi-node-elimination \ # RUN: -verify-machineinstrs -o /dev/null 2>&1 diff --git a/llvm/test/CodeGen/Thumb2/phi_prevent_copy.mir b/llvm/test/CodeGen/Thumb2/phi_prevent_copy.mir index 201972fae8cb0..0bd9f1c766e42 100644 --- a/llvm/test/CodeGen/Thumb2/phi_prevent_copy.mir +++ b/llvm/test/CodeGen/Thumb2/phi_prevent_copy.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir -run-pass=phi-node-elimination %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir --passes=phi-node-elimination %s -o - | FileCheck %s --- | ; ModuleID = '' target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/X86/callbr-asm-kill.mir b/llvm/test/CodeGen/X86/callbr-asm-kill.mir index 0dded37c97afa..5aabeade52da1 100644 --- a/llvm/test/CodeGen/X86/callbr-asm-kill.mir +++ b/llvm/test/CodeGen/X86/callbr-asm-kill.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs -O2 -run-pass=livevars,phi-node-elimination -o - %s | FileCheck %s +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 --passes='require,phi-node-elimination' -o - %s | FileCheck %s # Check that the COPY from [[MOV64rm]] is not killed, because there is a # subsequent use of [[MOV64rm]] in the INLINEASM_BR instruction which should be diff --git a/llvm/test/CodeGen/X86/phielim-undef.mir b/llvm/test/CodeGen/X86/phielim-undef.mir index 005ee37398157..cebc725537d0e 100644 --- a/llvm/test/CodeGen/X86/phielim-undef.mir +++ b/llvm/test/CodeGen/X86/phielim-undef.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s -run-pass=livevars,phi-node-elimination,twoaddressinstruction | FileCheck %s +# RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s --passes='require,phi-node-elimination,two-address-instruction' | FileCheck %s --- | @b114 = external global i16, align 1 From 484fdb901f2cc39b122489508009947910001213 Mon Sep 17 00:00:00 2001 From: Allen Date: Wed, 17 Jul 2024 11:44:06 +0800 Subject: [PATCH 210/777] [clang codegen] Fix the ci fail for PR98704 (#99267) Different targets may have different flag on arguments, so restrict the triple to avoid ci fail. 
--- clang/test/CodeGen/math-libcalls-tbaa.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/math-libcalls-tbaa.cpp b/clang/test/CodeGen/math-libcalls-tbaa.cpp index 5b93079492bc5..0b231d474df77 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa.cpp +++ b/clang/test/CodeGen/math-libcalls-tbaa.cpp @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -fmath-errno -O3 -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NoNewStructPathTBAA -// RUN: %clang_cc1 -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NewStructPathTBAA +// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NoNewStructPathTBAA +// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NewStructPathTBAA extern "C" float expf(float); From 493d504b35b9f655177b81ef3848e4a08a17831a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Wed, 17 Jul 2024 06:04:46 +0200 Subject: [PATCH 211/777] [GlobalIsel] Fix Machine Verifier errors (#99018) temporary solution. For discussion see https://github.com/llvm/llvm-project/pull/98894 Permanent solution could be: REQUIRES: default_triple --- llvm/test/MachineVerifier/test_g_extract_subvector.mir | 3 ++- llvm/test/MachineVerifier/test_g_insert_subvector.mir | 3 ++- llvm/test/MachineVerifier/test_g_vscale.mir | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/test/MachineVerifier/test_g_extract_subvector.mir b/llvm/test/MachineVerifier/test_g_extract_subvector.mir index bc167d2eb7bcd..5a441ff29c172 100644 --- a/llvm/test/MachineVerifier/test_g_extract_subvector.mir +++ b/llvm/test/MachineVerifier/test_g_extract_subvector.mir @@ -1,4 +1,5 @@ -# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s +# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs -mtriple=arm64 %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target --- name: g_extract_subvector tracksRegLiveness: true diff --git a/llvm/test/MachineVerifier/test_g_insert_subvector.mir b/llvm/test/MachineVerifier/test_g_insert_subvector.mir index dce30cdb6b1e5..9fce3c3e842d4 100644 --- a/llvm/test/MachineVerifier/test_g_insert_subvector.mir +++ b/llvm/test/MachineVerifier/test_g_insert_subvector.mir @@ -1,4 +1,5 @@ -# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s +# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs -mtriple=arm64 %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target --- name: g_splat_vector diff --git a/llvm/test/MachineVerifier/test_g_vscale.mir b/llvm/test/MachineVerifier/test_g_vscale.mir index 78854620913a1..f4ff76766a84e 100644 --- a/llvm/test/MachineVerifier/test_g_vscale.mir +++ b/llvm/test/MachineVerifier/test_g_vscale.mir @@ -1,4 +1,5 @@ -# RUN: not --crash llc -verify-machineinstrs -run-pass none -o /dev/null %s 2>&1 | FileCheck %s +# RUN: not --crash llc -verify-machineinstrs -mtriple=arm64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target --- name: g_vscale From 3850912fee9a14990bc3d72dc2654b03f9e2ab87 Mon Sep 17 00:00:00 2001 From: hev Date: Wed, 17 Jul 2024 12:21:44 +0800 Subject: [PATCH 212/777] [LoongArch] Enable the TypePromotion pass from AArch64 (#98868) 
--- .../LoongArch/LoongArchTargetMachine.cpp | 7 +++ llvm/test/CodeGen/LoongArch/andn-icmp.ll | 8 +++ llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 1 + .../LoongArch/typepromotion-overflow.ll | 52 +++++++------------ 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 137fe1d04f45b..e83fc08696aea 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -151,6 +151,7 @@ class LoongArchPassConfig : public TargetPassConfig { } void addIRPasses() override; + void addCodeGenPrepare() override; bool addInstSelector() override; void addPreEmitPass() override; void addPreEmitPass2() override; @@ -178,6 +179,12 @@ void LoongArchPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); } +void LoongArchPassConfig::addCodeGenPrepare() { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(createTypePromotionLegacyPass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool LoongArchPassConfig::addInstSelector() { addPass(createLoongArchISelDag(getLoongArchTargetMachine())); diff --git a/llvm/test/CodeGen/LoongArch/andn-icmp.ll b/llvm/test/CodeGen/LoongArch/andn-icmp.ll index 46bae6a9b70c8..c529c2e281214 100644 --- a/llvm/test/CodeGen/LoongArch/andn-icmp.ll +++ b/llvm/test/CodeGen/LoongArch/andn-icmp.ll @@ -6,12 +6,14 @@ define i1 @andn_icmp_eq_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_eq_i8: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_eq_i8: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -23,12 +25,14 @@ define i1 @andn_icmp_eq_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_eq_i16: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_eq_i16: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -76,12 +80,14 @@ define i1 @andn_icmp_ne_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ne_i8: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltu $a0, $zero, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ne_i8: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltu $a0, $zero, $a0 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -93,12 +99,14 @@ define i1 @andn_icmp_ne_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ne_i16: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 ; LA32-NEXT: sltu $a0, $zero, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ne_i16: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 ; LA64-NEXT: sltu $a0, $zero, $a0 ; LA64-NEXT: ret %and = and i16 %a, %b diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index b0c77155c095b..4e5a5433596db 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -68,6 +68,7 @@ ; LAXX-NEXT: Expand reduction intrinsics ; LAXX-NEXT: Natural Loop Information ; LAXX-NEXT: 
TLS Variable Hoist +; LAXX-NEXT: Type Promotion ; LAXX-NEXT: CodeGen Prepare ; LAXX-NEXT: Dominator Tree Construction ; LAXX-NEXT: Exception handling preparation diff --git a/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll b/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll index 68ad655130f5a..3f51e3b840097 100644 --- a/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll +++ b/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll @@ -287,7 +287,6 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) { ; LA32-LABEL: safe_add_underflow_neg: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, -2 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltui $a0, $a0, 251 ; LA32-NEXT: ori $a1, $zero, 16 ; LA32-NEXT: masknez $a1, $a1, $a0 @@ -299,7 +298,6 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) { ; LA64-LABEL: safe_add_underflow_neg: ; LA64: # %bb.0: ; LA64-NEXT: addi.d $a0, $a0, -2 -; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltui $a0, $a0, 251 ; LA64-NEXT: ori $a1, $zero, 16 ; LA64-NEXT: masknez $a1, $a1, $a0 @@ -344,8 +342,7 @@ define i32 @sext_sub_underflow(i8 zeroext %a) { ; LA32-LABEL: sext_sub_underflow: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, -6 -; LA32-NEXT: andi $a0, $a0, 255 -; LA32-NEXT: ori $a1, $zero, 250 +; LA32-NEXT: addi.w $a1, $zero, -6 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: ori $a1, $zero, 16 ; LA32-NEXT: masknez $a1, $a1, $a0 @@ -357,8 +354,7 @@ define i32 @sext_sub_underflow(i8 zeroext %a) { ; LA64-LABEL: sext_sub_underflow: ; LA64: # %bb.0: ; LA64-NEXT: addi.d $a0, $a0, -6 -; LA64-NEXT: andi $a0, $a0, 255 -; LA64-NEXT: ori $a1, $zero, 250 +; LA64-NEXT: addi.w $a1, $zero, -6 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ori $a1, $zero, 16 ; LA64-NEXT: masknez $a1, $a1, $a0 @@ -401,7 +397,6 @@ define i32 @safe_sub_underflow_neg(i8 zeroext %a) { ; LA32-LABEL: safe_sub_underflow_neg: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, -4 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: ori $a1, $zero, 250 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: ori $a1, $zero, 16 @@ -414,7 +409,6 @@ define i32 @safe_sub_underflow_neg(i8 zeroext %a) { ; LA64-LABEL: safe_sub_underflow_neg: ; LA64: # %bb.0: ; LA64-NEXT: addi.d $a0, $a0, -4 -; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: ori $a1, $zero, 250 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ori $a1, $zero, 16 @@ -433,8 +427,7 @@ define i32 @sext_sub_underflow_neg(i8 zeroext %a) { ; LA32-LABEL: sext_sub_underflow_neg: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, -4 -; LA32-NEXT: andi $a0, $a0, 255 -; LA32-NEXT: sltui $a0, $a0, 253 +; LA32-NEXT: sltui $a0, $a0, -3 ; LA32-NEXT: ori $a1, $zero, 16 ; LA32-NEXT: masknez $a1, $a1, $a0 ; LA32-NEXT: ori $a2, $zero, 8 @@ -445,8 +438,7 @@ define i32 @sext_sub_underflow_neg(i8 zeroext %a) { ; LA64-LABEL: sext_sub_underflow_neg: ; LA64: # %bb.0: ; LA64-NEXT: addi.d $a0, $a0, -4 -; LA64-NEXT: andi $a0, $a0, 255 -; LA64-NEXT: sltui $a0, $a0, 253 +; LA64-NEXT: sltui $a0, $a0, -3 ; LA64-NEXT: ori $a1, $zero, 16 ; LA64-NEXT: masknez $a1, $a1, $a0 ; LA64-NEXT: ori $a2, $zero, 8 @@ -476,19 +468,17 @@ entry: define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 { ; LA32-LABEL: safe_sub_var_imm: ; LA32: # %bb.0: # %entry -; LA32-NEXT: ld.b $a0, $a0, 0 -; LA32-NEXT: addi.w $a0, $a0, 8 -; LA32-NEXT: andi $a0, $a0, 255 -; LA32-NEXT: ori $a1, $zero, 252 +; LA32-NEXT: ld.bu $a0, $a0, 0 +; LA32-NEXT: addi.w $a0, $a0, -248 +; LA32-NEXT: addi.w $a1, $zero, -4 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: safe_sub_var_imm: ; LA64: # %bb.0: # %entry -; 
LA64-NEXT: ld.b $a0, $a0, 0 -; LA64-NEXT: addi.d $a0, $a0, 8 -; LA64-NEXT: andi $a0, $a0, 255 -; LA64-NEXT: ori $a1, $zero, 252 +; LA64-NEXT: ld.bu $a0, $a0, 0 +; LA64-NEXT: addi.d $a0, $a0, -248 +; LA64-NEXT: addi.w $a1, $zero, -4 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ret entry: @@ -533,11 +523,10 @@ define i8 @convert_add_order(i8 zeroext %arg) { ; LA32-NEXT: ori $a1, $a0, 1 ; LA32-NEXT: sltui $a2, $a1, 50 ; LA32-NEXT: addi.w $a1, $a1, -40 -; LA32-NEXT: andi $a1, $a1, 255 ; LA32-NEXT: sltui $a1, $a1, 20 ; LA32-NEXT: ori $a3, $zero, 2 ; LA32-NEXT: sub.w $a1, $a3, $a1 -; LA32-NEXT: addi.w $a3, $zero, -1 +; LA32-NEXT: ori $a3, $zero, 255 ; LA32-NEXT: masknez $a3, $a3, $a2 ; LA32-NEXT: maskeqz $a1, $a1, $a2 ; LA32-NEXT: or $a1, $a1, $a3 @@ -549,11 +538,10 @@ define i8 @convert_add_order(i8 zeroext %arg) { ; LA64-NEXT: ori $a1, $a0, 1 ; LA64-NEXT: sltui $a2, $a1, 50 ; LA64-NEXT: addi.d $a1, $a1, -40 -; LA64-NEXT: andi $a1, $a1, 255 ; LA64-NEXT: sltui $a1, $a1, 20 ; LA64-NEXT: ori $a3, $zero, 2 ; LA64-NEXT: sub.d $a1, $a3, $a1 -; LA64-NEXT: addi.w $a3, $zero, -1 +; LA64-NEXT: ori $a3, $zero, 255 ; LA64-NEXT: masknez $a3, $a3, $a2 ; LA64-NEXT: maskeqz $a1, $a1, $a2 ; LA64-NEXT: or $a1, $a1, $a3 @@ -574,9 +562,8 @@ define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) { ; LA32: # %bb.0: ; LA32-NEXT: slt $a2, $zero, $a0 ; LA32-NEXT: and $a0, $a2, $a0 -; LA32-NEXT: addi.w $a0, $a0, -11 -; LA32-NEXT: andi $a2, $a0, 247 -; LA32-NEXT: sltu $a1, $a2, $a1 +; LA32-NEXT: addi.w $a0, $a0, 245 +; LA32-NEXT: sltu $a1, $a0, $a1 ; LA32-NEXT: maskeqz $a0, $a0, $a1 ; LA32-NEXT: ori $a2, $zero, 100 ; LA32-NEXT: masknez $a1, $a2, $a1 @@ -588,9 +575,8 @@ define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) { ; LA64-NEXT: addi.w $a2, $a0, 0 ; LA64-NEXT: slt $a2, $zero, $a2 ; LA64-NEXT: and $a0, $a2, $a0 -; LA64-NEXT: addi.d $a0, $a0, -11 -; LA64-NEXT: andi $a2, $a0, 247 -; LA64-NEXT: sltu $a1, $a2, $a1 +; LA64-NEXT: addi.d $a0, $a0, 245 +; LA64-NEXT: sltu $a1, $a0, $a1 ; LA64-NEXT: maskeqz $a0, $a0, $a1 ; LA64-NEXT: ori $a2, $zero, 100 ; LA64-NEXT: masknez $a1, $a2, $a1 @@ -609,9 +595,10 @@ define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) { define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) { ; LA32-LABEL: underflow_if_sub_signext: ; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 ; LA32-NEXT: slt $a2, $zero, $a0 ; LA32-NEXT: and $a0, $a2, $a0 -; LA32-NEXT: addi.w $a0, $a0, -11 +; LA32-NEXT: addi.w $a0, $a0, 245 ; LA32-NEXT: sltu $a1, $a0, $a1 ; LA32-NEXT: maskeqz $a0, $a0, $a1 ; LA32-NEXT: ori $a2, $zero, 100 @@ -622,9 +609,10 @@ define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) { ; LA64-LABEL: underflow_if_sub_signext: ; LA64: # %bb.0: ; LA64-NEXT: addi.w $a2, $a0, 0 +; LA64-NEXT: andi $a1, $a1, 255 ; LA64-NEXT: slt $a2, $zero, $a2 ; LA64-NEXT: and $a0, $a2, $a0 -; LA64-NEXT: addi.d $a0, $a0, -11 +; LA64-NEXT: addi.d $a0, $a0, 245 ; LA64-NEXT: sltu $a1, $a0, $a1 ; LA64-NEXT: maskeqz $a0, $a0, $a1 ; LA64-NEXT: ori $a2, $zero, 100 From 9e9924cc2e0d0f03def09e02e7344d10066eb1b6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 17 Jul 2024 12:50:17 +0800 Subject: [PATCH 213/777] [RISCV] Don't fold vmerge.vvm or vmv.v.v into vredsum.vs if AVL changed (#99006) When folding, we currently check if the pseudo's result is not lanewise (e.g. vredsum.vs or viota.m) and bail if we're changing the mask. However we also need to check for the AVL too. This patch bails if the AVL changed for these pseudos, and also renames the pseudo table property to be more explicit. 
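For intuition: a reduction's scalar result depends on how many elements are
active, so the same inputs with a different AVL give a different answer,
unlike lanewise ops such as vadd. A standalone C++ sketch of that observation
(illustrative only; nothing below is code from this patch):

    #include <cstddef>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // vredsum-style ops reduce the first VL elements into one scalar, so
    // shrinking or growing VL changes the result even for identical inputs.
    int64_t redsum(const std::vector<int64_t> &v, std::size_t vl) {
      return std::accumulate(v.begin(), v.begin() + vl, int64_t{0});
    }

    int main() {
      std::vector<int64_t> v{1, 2, 3, 4};
      return (redsum(v, 4) == 10 && redsum(v, 1) == 1) ? 0 : 1;
    }

Hence the renamed ActiveElementsAffectResult guard below bails out when the
fold would change either the mask or the AVL (TrueVL != VL).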
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 15 ++++-- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 2 +- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 12 ++--- llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll | 54 ++++++++++++++++++- .../RISCV/rvv/rvv-peephole-vmerge-vops.ll | 18 +++++++ 5 files changed, 87 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index adde745a5a91b..eef6ae677ac85 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3753,11 +3753,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { if (!Info) return false; - // When Mask is not a true mask, this transformation is illegal for some - // operations whose results are affected by mask, like viota.m. - if (Info->MaskAffectsResult && Mask && !usesAllOnesMask(Mask, Glue)) - return false; - // If True has a merge operand then it needs to be the same as vmerge's False, // since False will be used for the result's merge operand. if (HasTiedDest && !isImplicitDef(True->getOperand(0))) { @@ -3835,6 +3830,16 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { if (!VL) return false; + // Some operations produce different elementwise results depending on the + // active elements, like viota.m or vredsum. This transformation is illegal + // for these if we change the active elements (i.e. mask or VL). + if (Info->ActiveElementsAffectResult) { + if (Mask && !usesAllOnesMask(Mask, Glue)) + return false; + if (TrueVL != VL) + return false; + } + // If we end up changing the VL or mask of True, then we need to make sure it // doesn't raise any observable fp exceptions, since changing the active // elements will affect how fflags is set. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index f0c0953a3e56a..025cc36d19eb7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -381,7 +381,7 @@ struct RISCVMaskedPseudoInfo { uint16_t MaskedPseudo; uint16_t UnmaskedPseudo; uint8_t MaskOpIdx; - uint8_t MaskAffectsResult : 1; + uint8_t ActiveElementsAffectResult : 1; }; #define GET_RISCVMaskedPseudosTable_DECL #include "RISCVGenSearchableTables.inc" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index d72390b7c14b5..b860273d639ee 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -561,17 +561,17 @@ def RISCVVIntrinsicsTable : GenericTable { // unmasked variant. For all but compares, both the masked and // unmasked variant have a passthru and policy operand. For compares, // neither has a policy op, and only the masked version has a passthru. 
-class RISCVMaskedPseudo MaskIdx, bit MaskAffectsRes=false> { +class RISCVMaskedPseudo MaskIdx, bit ActiveAffectsRes=false> { Pseudo MaskedPseudo = !cast(NAME); Pseudo UnmaskedPseudo = !cast(!subst("_MASK", "", NAME)); bits<4> MaskOpIdx = MaskIdx; - bit MaskAffectsResult = MaskAffectsRes; + bit ActiveElementsAffectResult = ActiveAffectsRes; } def RISCVMaskedPseudosTable : GenericTable { let FilterClass = "RISCVMaskedPseudo"; let CppTypeName = "RISCVMaskedPseudoInfo"; - let Fields = ["MaskedPseudo", "UnmaskedPseudo", "MaskOpIdx", "MaskAffectsResult"]; + let Fields = ["MaskedPseudo", "UnmaskedPseudo", "MaskOpIdx", "ActiveElementsAffectResult"]; let PrimaryKey = ["MaskedPseudo"]; let PrimaryKeyName = "getMaskedPseudoInfo"; } @@ -2065,7 +2065,7 @@ multiclass VPseudoVIOTA_M { SchedUnary<"WriteVIotaV", "ReadVIotaV", mx, forceMergeOpRead=true>; def "_" # mx # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, SchedUnary<"WriteVIotaV", "ReadVIotaV", mx, forceMergeOpRead=true>; } @@ -3162,7 +3162,7 @@ multiclass VPseudoTernaryWithTailPolicy; def "_" # mx # "_E" # sew # "_MASK" : VPseudoTernaryMaskPolicy, - RISCVMaskedPseudo; + RISCVMaskedPseudo; } } @@ -3179,7 +3179,7 @@ multiclass VPseudoTernaryWithTailPolicyRoundingMode, - RISCVMaskedPseudo; + RISCVMaskedPseudo; } } diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll index 61acf1afa94de..ec03f773c7108 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll @@ -19,6 +19,17 @@ define @vadd( %passthru, ret %w } +define @vadd_mask( %passthru, %a, %b, %m, iXLen %vl) { +; CHECK-LABEL: vadd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu +; CHECK-NEXT: vadd.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.riscv.vadd.mask.nxv4i32.nxv4i32( poison, %a, %b, %m, iXLen %vl, iXLen 3) + %w = call @llvm.riscv.vmv.v.v.nxv4i32( %passthru, %v, iXLen %vl) + ret %w +} + define @vadd_undef( %a, %b, iXLen %vl1, iXLen %vl2) { ; CHECK-LABEL: vadd_undef: ; CHECK: # %bb.0: @@ -106,8 +117,8 @@ declare @llvm.riscv.vmv.v.v.nxv4f32(, < declare @llvm.riscv.vfadd.nxv4f32.nxv4f32(, , , iXLen, iXLen) -define @vfadd( %passthru, %a, %b, iXLen %vl1, iXLen %vl2) { -; CHECK-LABEL: vfadd: +define @unfoldable_vfadd( %passthru, %a, %b, iXLen %vl1, iXLen %vl2) { +; CHECK-LABEL: unfoldable_vfadd: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v10, v10, v12 @@ -118,3 +129,42 @@ define @vfadd( %passthru, @llvm.riscv.vmv.v.v.nxv4f32( %passthru, %v, iXLen %vl2) ret %w } + +define @foldable_vfadd( %passthru, %a, %b, iXLen %vl) { +; CHECK-LABEL: foldable_vfadd: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; CHECK-NEXT: vfadd.vv v8, v10, v12 +; CHECK-NEXT: ret + %v = call @llvm.riscv.vfadd.nxv4f32.nxv4f32( poison, %a, %b, iXLen 7, iXLen %vl) + %w = call @llvm.riscv.vmv.v.v.nxv4f32( %passthru, %v, iXLen %vl) + ret %w +} + +; This shouldn't be folded because we need to preserve exceptions with +; "fpexcept.strict" exception behaviour, and changing the VL may hide them. 
+define @unfoldable_constrained_fadd( %passthru, %x, %y, iXLen %vl) strictfp { +; CHECK-LABEL: unfoldable_constrained_fadd: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.experimental.constrained.fadd( %x, %y, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp + %b = call @llvm.riscv.vmv.v.v.nxv4f32( %passthru, %a, iXLen %vl) strictfp + ret %b +} + +define @unfoldable_vredsum( %passthru, %x, %y) { +; CHECK-LABEL: unfoldable_vredsum: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vredsum.vs v9, v10, v9 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.riscv.vredsum.nxv2i32.nxv4i32( poison, %x, %y, iXLen -1) + %b = call @llvm.riscv.vmv.v.v.nxv2i32( %passthru, %a, iXLen 1) + ret %b +} diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index b6921abf8fdf4..a08bcae074b9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -1014,6 +1014,24 @@ define @vfredusum_allones_mask( %passth ret %b } +define @unfoldable_vredsum_allones_mask_diff_vl( %passthru, %x, %y) { +; CHECK-LABEL: unfoldable_vredsum_allones_mask_diff_vl: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, ma +; CHECK-NEXT: vredsum.vs v11, v9, v10 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: ret + %a = call @llvm.riscv.vredsum.nxv2i32.nxv2i32( + %passthru, + %x, + %y, + i64 -1) + %b = call @llvm.riscv.vmerge.nxv2i32.nxv2i32( %passthru, %passthru, %a, splat (i1 -1), i64 1) + ret %b +} + declare @llvm.riscv.vle.nxv32i16.i64(, ptr nocapture, i64) declare @llvm.riscv.vssubu.mask.nxv32i8.i8.i64(, , i8, , i64, i64 immarg) declare @llvm.riscv.vmseq.nxv32i8.nxv32i8.i64(, , i64) From 6192f458f4fd2ca4e6f01515547034f89325e73c Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 16 Jul 2024 21:58:13 -0700 Subject: [PATCH 214/777] [libc++] Make `std::lock_guard` available with `_LIBCPP_HAS_NO_THREADS` (#98717) This change makes `std::lock_guard` available when `_LIBCPP_HAS_NO_THREADS` is set. This class is generic and doesn't require threading support, and is regularly used even in environments where threading isn't available like embedded. 
fixes #89891 --------- Co-authored-by: Louis Dionne --- libcxx/include/__mutex/lock_guard.h | 4 -- libcxx/include/__mutex/tag_types.h | 10 ++-- ...mpile.fail.cpp => assign.compile.pass.cpp} | 11 +---- ...compile.fail.cpp => copy.compile.pass.cpp} | 9 +--- ...lock.pass.cpp => ctor.adopt_lock.pass.cpp} | 23 +++------ .../{mutex.verify.cpp => ctor.mutex.pass.cpp} | 17 ++++--- .../thread.lock.guard/implicit_ctad.pass.cpp | 10 ++-- .../thread.lock.guard/mutex.pass.cpp | 49 ------------------- .../thread.lock.guard/std.mutex.pass.cpp | 49 +++++++++++++++++++ ...{types.pass.cpp => types.compile.pass.cpp} | 12 +---- .../thread.lock/thread.lock.guard/types.h | 33 +++++++++++++ 11 files changed, 113 insertions(+), 114 deletions(-) rename libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/{assign.compile.fail.cpp => assign.compile.pass.cpp} (74%) rename libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/{copy.compile.fail.cpp => copy.compile.pass.cpp} (78%) rename libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/{adopt_lock.pass.cpp => ctor.adopt_lock.pass.cpp} (64%) rename libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/{mutex.verify.cpp => ctor.mutex.pass.cpp} (63%) delete mode 100644 libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp create mode 100644 libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/std.mutex.pass.cpp rename libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/{types.pass.cpp => types.compile.pass.cpp} (71%) create mode 100644 libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.h diff --git a/libcxx/include/__mutex/lock_guard.h b/libcxx/include/__mutex/lock_guard.h index 8340b9bbd4453..ef56896be9f68 100644 --- a/libcxx/include/__mutex/lock_guard.h +++ b/libcxx/include/__mutex/lock_guard.h @@ -16,8 +16,6 @@ # pragma GCC system_header #endif -#ifndef _LIBCPP_HAS_NO_THREADS - _LIBCPP_BEGIN_NAMESPACE_STD template @@ -47,6 +45,4 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(lock_guard); _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_HAS_NO_THREADS - #endif // _LIBCPP___MUTEX_LOCK_GUARD_H diff --git a/libcxx/include/__mutex/tag_types.h b/libcxx/include/__mutex/tag_types.h index 05ccb8b23a6f4..2b2dd58ee4e80 100644 --- a/libcxx/include/__mutex/tag_types.h +++ b/libcxx/include/__mutex/tag_types.h @@ -15,8 +15,6 @@ # pragma GCC system_header #endif -#ifndef _LIBCPP_HAS_NO_THREADS - _LIBCPP_BEGIN_NAMESPACE_STD struct _LIBCPP_EXPORTED_FROM_ABI defer_lock_t { @@ -31,18 +29,16 @@ struct _LIBCPP_EXPORTED_FROM_ABI adopt_lock_t { explicit adopt_lock_t() = default; }; -# if _LIBCPP_STD_VER >= 17 +#if _LIBCPP_STD_VER >= 17 inline constexpr defer_lock_t defer_lock = defer_lock_t(); inline constexpr try_to_lock_t try_to_lock = try_to_lock_t(); inline constexpr adopt_lock_t adopt_lock = adopt_lock_t(); -# elif !defined(_LIBCPP_CXX03_LANG) +#elif !defined(_LIBCPP_CXX03_LANG) constexpr defer_lock_t defer_lock = defer_lock_t(); constexpr try_to_lock_t try_to_lock = try_to_lock_t(); constexpr adopt_lock_t adopt_lock = adopt_lock_t(); -# endif +#endif _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_HAS_NO_THREADS - #endif // _LIBCPP___MUTEX_TAG_TYPES_H diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.pass.cpp similarity index 74% rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp rename to 
libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.pass.cpp index 2d0f438ed0391..abd37ea0d55dd 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.pass.cpp @@ -14,13 +14,6 @@ #include -int main(int, char**) -{ - std::mutex m0; - std::mutex m1; - std::lock_guard lg0(m0); - std::lock_guard lg(m1); - lg = lg0; +#include "types.h" - return 0; -} +static_assert(!std::is_copy_assignable >::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.pass.cpp similarity index 78% rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.pass.cpp index e99517e47e8c6..2a5973726ff1d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.pass.cpp @@ -14,11 +14,6 @@ #include -int main(int, char**) -{ - std::mutex m; - std::lock_guard lg0(m); - std::lock_guard lg(lg0); +#include "types.h" - return 0; -} +static_assert(!std::is_copy_constructible >::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/ctor.adopt_lock.pass.cpp similarity index 64% rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/ctor.adopt_lock.pass.cpp index 4d11674f1e83c..d674b9a93fc64 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/ctor.adopt_lock.pass.cpp @@ -5,8 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads + // UNSUPPORTED: c++03 // @@ -16,28 +15,18 @@ // lock_guard(mutex_type& m, adopt_lock_t); #include -#include #include -#include "make_test_thread.h" -#include "test_macros.h" - -std::mutex m; - -void do_try_lock() { - assert(m.try_lock() == false); -} +#include "types.h" int main(int, char**) { + MyMutex m; { m.lock(); - std::lock_guard lg(m, std::adopt_lock); - std::thread t = support::make_test_thread(do_try_lock); - t.join(); + std::lock_guard lg(m, std::adopt_lock); + assert(m.locked); } - - m.lock(); - m.unlock(); + assert(!m.locked); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.verify.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/ctor.mutex.pass.cpp similarity index 63% rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.verify.cpp rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/ctor.mutex.pass.cpp index 82f672891c452..9fcffd2f9957d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.verify.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/ctor.mutex.pass.cpp @@ -6,20 +6,25 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: 
no-threads - // // template class lock_guard; // explicit lock_guard(mutex_type& m); +#include #include +#include + +#include "types.h" + +int main(int, char**) { + MyMutex m; + assert(!m.locked); + std::lock_guard lg(m); + assert(m.locked); -int main(int, char**) -{ - std::mutex m; - std::lock_guard lg = m; // expected-error{{no viable conversion}} + static_assert(!std::is_convertible >::value, "constructor must be explicit"); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp index 9319ec0dba04e..cd5e6692731fe 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp @@ -6,24 +6,24 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11, c++14 // -// lock_guard +// template class lock_guard; // Make sure that the implicitly-generated CTAD works. #include #include "test_macros.h" +#include "types.h" int main(int, char**) { - std::mutex mutex; + MyMutex m; { - std::lock_guard lock(mutex); - ASSERT_SAME_TYPE(decltype(lock), std::lock_guard); + std::lock_guard lg(m); + ASSERT_SAME_TYPE(decltype(lg), std::lock_guard); } return 0; diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp deleted file mode 100644 index 6025b0c3b465b..0000000000000 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp +++ /dev/null @@ -1,49 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads - -// - -// template class lock_guard; - -// explicit lock_guard(mutex_type& m); - -// template lock_guard(lock_guard<_Mutex>) -// -> lock_guard<_Mutex>; // C++17 - -#include -#include -#include - -#include "make_test_thread.h" -#include "test_macros.h" - -std::mutex m; - -void do_try_lock() { - assert(m.try_lock() == false); -} - -int main(int, char**) { - { - std::lock_guard lg(m); - std::thread t = support::make_test_thread(do_try_lock); - t.join(); - } - - m.lock(); - m.unlock(); - -#if TEST_STD_VER >= 17 - std::lock_guard lg(m); - static_assert((std::is_same>::value), "" ); -#endif - - return 0; -} diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/std.mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/std.mutex.pass.cpp new file mode 100644 index 0000000000000..5453db49d4e34 --- /dev/null +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/std.mutex.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03 + +// Test the interoperation of std::lock_guard with std::mutex, since that is such +// a common use case. + +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +void do_try_lock(std::mutex& m) { assert(m.try_lock() == false); } + +int main(int, char**) { + { + std::mutex m; + { + std::lock_guard lg(m); + std::thread t = support::make_test_thread(do_try_lock, std::ref(m)); + t.join(); + } + + // This should work because the lock_guard unlocked the mutex when it was destroyed above. + m.lock(); + m.unlock(); + } + + // Test CTAD +#if TEST_STD_VER >= 17 + { + std::mutex m; + std::lock_guard lg(m); + static_assert(std::is_same>::value, ""); + } +#endif + + return 0; +} diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.compile.pass.cpp similarity index 71% rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.compile.pass.cpp index 8b10d9dab8f2a..015dbfe3c46ae 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.compile.pass.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads // @@ -21,12 +19,6 @@ #include #include -#include "test_macros.h" - -int main(int, char**) -{ - static_assert((std::is_same::mutex_type, - std::mutex>::value), ""); +#include "types.h" - return 0; -} +static_assert(std::is_same::mutex_type, MyMutex>::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.h b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.h new file mode 100644 index 0000000000000..5aeed21547880 --- /dev/null +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H +#define TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H + +#include + +struct MyMutex { + bool locked = false; + + MyMutex() = default; + ~MyMutex() { assert(!locked); } + + void lock() { + assert(!locked); + locked = true; + } + void unlock() { + assert(locked); + locked = false; + } + + MyMutex(MyMutex const&) = delete; + MyMutex& operator=(MyMutex const&) = delete; +}; + +#endif // TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H From 6d26e574241e04264c10e15781c0788363f3e015 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 16 Jul 2024 21:21:40 -0700 Subject: [PATCH 215/777] [RISCV] Remove accidentally duplicated isel patterns. 
NFC VPatIntegerSetCCSDNode_XI_Swappable inherited from VPatIntegerSetCCSDNode_XI and contained the same patterns. --- llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index e82625f085bec..a0f37ea4c6fea 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -325,9 +325,7 @@ multiclass VPatIntegerSetCCSDNode_XI_Swappable - : VPatIntegerSetCCSDNode_XI { + DAGOperand xop_kind> { foreach vti = AllIntegerVectors in { defvar instruction = !cast(instruction_name#_#kind#_#vti.LMul.MX); let Predicates = GetVTypePredicates.Predicates in { From 8ca7e24b359ab2b6d2868f9252c7cd11eb48c787 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 16 Jul 2024 21:26:08 -0700 Subject: [PATCH 216/777] [RISCV] Add more vector setcc VI isel patterns. Add more patterns isel patterns for vmseq.vi and friends with the constant splat on the left hand side. We can't trust the canonicalization in SimplifySetCC to keep constants to the RHS when the splat is VMV_V_X_VL for i64 on RV32. --- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 74 +++-- llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll | 254 +++++++----------- 2 files changed, 128 insertions(+), 200 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index a0f37ea4c6fea..7afd6def4e4d2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -306,21 +306,6 @@ multiclass VPatIntegerSetCCSDNode_VV_Swappable { - foreach vti = AllIntegerVectors in { - defvar instruction = !cast(instruction_name#_#kind#_#vti.LMul.MX); - let Predicates = GetVTypePredicates.Predicates in - def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), - (vti.Vector (SplatPatKind (XLenVT xop_kind:$rs2))), cc)), - (instruction vti.RegClass:$rs1, xop_kind:$rs2, vti.AVL, vti.Log2SEW)>; - } -} - multiclass VPatIntegerSetCCSDNode_XI_Swappable; -multiclass VPatIntegerSetCCSDNode_VI - : VPatIntegerSetCCSDNode_XI; +multiclass VPatIntegerSetCCSDNode_VI_Swappable + : VPatIntegerSetCCSDNode_XI_Swappable; -multiclass VPatIntegerSetCCSDNode_VIPlus1 { +multiclass VPatIntegerSetCCSDNode_VIPlus1_Swappable { foreach vti = AllIntegerVectors in { defvar instruction = !cast(instruction_name#"_VI_"#vti.LMul.MX); - let Predicates = GetVTypePredicates.Predicates in - def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), - (vti.Vector (splatpat_kind simm5:$rs2)), - cc)), - (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2), - vti.AVL, vti.Log2SEW)>; + let Predicates = GetVTypePredicates.Predicates in { + def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), + (vti.Vector (splatpat_kind simm5:$rs2)), + cc)), + (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2), + vti.AVL, vti.Log2SEW)>; + def : Pat<(vti.Mask (setcc (vti.Vector (splatpat_kind simm5:$rs2)), + (vti.Vector vti.RegClass:$rs1), + invcc)), + (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2), + vti.AVL, vti.Log2SEW)>; + } } } @@ -1045,21 +1039,21 @@ defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSGT", SETGT, SETLT>; defm : VPatIntegerSetCCSDNode_VX_Swappable<"PseudoVMSGTU", SETUGT, SETULT>; // There is no VMSGE(U)_VX instruction -defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSEQ", SETEQ>; -defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSNE", SETNE>; -defm : 
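As a rough source-level illustration (made up for this note, not taken from
the tests below): code like the following ends up as a vector compare of each
element against a zero splat, and on RV32 that i64 splat may already be a
VMV_V_X_VL node by the time isel runs, leaving the constant operand on the
left-hand side:

    #include <cstdint>

    // The zero guard around the cttz idiom becomes a vector setcc after
    // vectorization; the new patterns let that compare still select
    // vmseq.vi even when the splat sits on the left-hand side.
    void cttz64(uint64_t *v, int n) {
      for (int i = 0; i < n; ++i)
        v[i] = v[i] ? __builtin_ctzll(v[i]) : 64;
    }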
VPatIntegerSetCCSDNode_VI<"PseudoVMSLE", SETLE>; -defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSLEU", SETULE>; -defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSGT", SETGT>; -defm : VPatIntegerSetCCSDNode_VI<"PseudoVMSGTU", SETUGT>; - -defm : VPatIntegerSetCCSDNode_VIPlus1<"PseudoVMSLE", SETLT, - SplatPat_simm5_plus1>; -defm : VPatIntegerSetCCSDNode_VIPlus1<"PseudoVMSLEU", SETULT, - SplatPat_simm5_plus1_nonzero>; -defm : VPatIntegerSetCCSDNode_VIPlus1<"PseudoVMSGT", SETGE, - SplatPat_simm5_plus1>; -defm : VPatIntegerSetCCSDNode_VIPlus1<"PseudoVMSGTU", SETUGE, - SplatPat_simm5_plus1_nonzero>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSEQ", SETEQ, SETEQ>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSNE", SETNE, SETNE>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSLE", SETLE, SETGE>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSLEU", SETULE, SETUGE>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSGT", SETGT, SETLT>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSGTU", SETUGT, SETULT>; + +defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSLE", SETLT, SETGT, + SplatPat_simm5_plus1>; +defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSLEU", SETULT, SETUGT, + SplatPat_simm5_plus1_nonzero>; +defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSGT", SETGE, SETLE, + SplatPat_simm5_plus1>; +defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSGTU", SETUGE, SETULE, + SplatPat_simm5_plus1_nonzero>; // 11.9. Vector Integer Min/Max Instructions defm : VPatBinarySDNode_VV_VX; diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll index 479664c6f5f62..50bbe4f7b4c2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll @@ -1216,19 +1216,19 @@ define @cttz_nxv1i64( %va) { ; RV32F-LABEL: cttz_nxv1i64: ; RV32F: # %bb.0: ; RV32F-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32F-NEXT: vmseq.vx v0, v8, zero ; RV32F-NEXT: vrsub.vi v9, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v9 +; RV32F-NEXT: vand.vv v9, v8, v9 ; RV32F-NEXT: fsrmi a0, 1 ; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v9, v8 -; RV32F-NEXT: vsrl.vi v8, v9, 23 +; RV32F-NEXT: vfncvt.f.xu.w v10, v9 +; RV32F-NEXT: vsrl.vi v9, v10, 23 ; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vzext.vf2 v9, v8 +; RV32F-NEXT: vzext.vf2 v10, v9 ; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v9, a1 +; RV32F-NEXT: vsub.vx v9, v10, a1 +; RV32F-NEXT: vmseq.vi v0, v8, 0 ; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 ; RV32F-NEXT: fsrm a0 ; RV32F-NEXT: ret ; @@ -1250,39 +1250,22 @@ define @cttz_nxv1i64( %va) { ; RV64F-NEXT: fsrm a0 ; RV64F-NEXT: ret ; -; RV32D-LABEL: cttz_nxv1i64: -; RV32D: # %bb.0: -; RV32D-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32D-NEXT: vmseq.vx v0, v8, zero -; RV32D-NEXT: vrsub.vi v9, v8, 0 -; RV32D-NEXT: vand.vv v8, v8, v9 -; RV32D-NEXT: fsrmi a0, 1 -; RV32D-NEXT: vfcvt.f.xu.v v8, v8 -; RV32D-NEXT: li a1, 52 -; RV32D-NEXT: vsrl.vx v8, v8, a1 -; RV32D-NEXT: li a1, 1023 -; RV32D-NEXT: vsub.vx v8, v8, a1 -; RV32D-NEXT: li a1, 64 -; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 -; RV32D-NEXT: fsrm a0 -; RV32D-NEXT: ret -; -; RV64D-LABEL: cttz_nxv1i64: -; RV64D: # %bb.0: -; RV64D-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64D-NEXT: vrsub.vi v9, v8, 0 -; RV64D-NEXT: vand.vv v9, v8, v9 -; RV64D-NEXT: fsrmi a0, 1 -; RV64D-NEXT: vfcvt.f.xu.v v9, v9 -; RV64D-NEXT: li a1, 52 -; RV64D-NEXT: 
vsrl.vx v9, v9, a1 -; RV64D-NEXT: li a1, 1023 -; RV64D-NEXT: vsub.vx v9, v9, a1 -; RV64D-NEXT: vmseq.vi v0, v8, 0 -; RV64D-NEXT: li a1, 64 -; RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 -; RV64D-NEXT: fsrm a0 -; RV64D-NEXT: ret +; CHECK-D-LABEL: cttz_nxv1i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: vand.vv v9, v8, v9 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v9, v9 +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v9, v9, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v9, v9, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv1i64: ; CHECK-ZVBB: # %bb.0: @@ -1378,19 +1361,19 @@ define @cttz_nxv2i64( %va) { ; RV32F-LABEL: cttz_nxv2i64: ; RV32F: # %bb.0: ; RV32F-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32F-NEXT: vmseq.vx v0, v8, zero ; RV32F-NEXT: vrsub.vi v10, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v10 +; RV32F-NEXT: vand.vv v10, v8, v10 ; RV32F-NEXT: fsrmi a0, 1 ; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: vsrl.vi v8, v10, 23 +; RV32F-NEXT: vfncvt.f.xu.w v12, v10 +; RV32F-NEXT: vsrl.vi v10, v12, 23 ; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vzext.vf2 v10, v8 +; RV32F-NEXT: vzext.vf2 v12, v10 ; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v10, a1 +; RV32F-NEXT: vsub.vx v10, v12, a1 +; RV32F-NEXT: vmseq.vi v0, v8, 0 ; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 ; RV32F-NEXT: fsrm a0 ; RV32F-NEXT: ret ; @@ -1412,39 +1395,22 @@ define @cttz_nxv2i64( %va) { ; RV64F-NEXT: fsrm a0 ; RV64F-NEXT: ret ; -; RV32D-LABEL: cttz_nxv2i64: -; RV32D: # %bb.0: -; RV32D-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32D-NEXT: vmseq.vx v0, v8, zero -; RV32D-NEXT: vrsub.vi v10, v8, 0 -; RV32D-NEXT: vand.vv v8, v8, v10 -; RV32D-NEXT: fsrmi a0, 1 -; RV32D-NEXT: vfcvt.f.xu.v v8, v8 -; RV32D-NEXT: li a1, 52 -; RV32D-NEXT: vsrl.vx v8, v8, a1 -; RV32D-NEXT: li a1, 1023 -; RV32D-NEXT: vsub.vx v8, v8, a1 -; RV32D-NEXT: li a1, 64 -; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 -; RV32D-NEXT: fsrm a0 -; RV32D-NEXT: ret -; -; RV64D-LABEL: cttz_nxv2i64: -; RV64D: # %bb.0: -; RV64D-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64D-NEXT: vrsub.vi v10, v8, 0 -; RV64D-NEXT: vand.vv v10, v8, v10 -; RV64D-NEXT: fsrmi a0, 1 -; RV64D-NEXT: vfcvt.f.xu.v v10, v10 -; RV64D-NEXT: li a1, 52 -; RV64D-NEXT: vsrl.vx v10, v10, a1 -; RV64D-NEXT: li a1, 1023 -; RV64D-NEXT: vsub.vx v10, v10, a1 -; RV64D-NEXT: vmseq.vi v0, v8, 0 -; RV64D-NEXT: li a1, 64 -; RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 -; RV64D-NEXT: fsrm a0 -; RV64D-NEXT: ret +; CHECK-D-LABEL: cttz_nxv2i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-D-NEXT: vrsub.vi v10, v8, 0 +; CHECK-D-NEXT: vand.vv v10, v8, v10 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v10, v10 +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v10, v10, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v10, v10, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv2i64: ; CHECK-ZVBB: # %bb.0: @@ -1540,19 +1506,19 @@ define @cttz_nxv4i64( %va) { ; RV32F-LABEL: cttz_nxv4i64: ; RV32F: # %bb.0: ; RV32F-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; 
RV32F-NEXT: vmseq.vx v0, v8, zero ; RV32F-NEXT: vrsub.vi v12, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v12 +; RV32F-NEXT: vand.vv v12, v8, v12 ; RV32F-NEXT: fsrmi a0, 1 ; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v8 -; RV32F-NEXT: vsrl.vi v8, v12, 23 +; RV32F-NEXT: vfncvt.f.xu.w v16, v12 +; RV32F-NEXT: vsrl.vi v12, v16, 23 ; RV32F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32F-NEXT: vzext.vf2 v12, v8 +; RV32F-NEXT: vzext.vf2 v16, v12 ; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v12, a1 +; RV32F-NEXT: vsub.vx v12, v16, a1 +; RV32F-NEXT: vmseq.vi v0, v8, 0 ; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: vmerge.vxm v8, v12, a1, v0 ; RV32F-NEXT: fsrm a0 ; RV32F-NEXT: ret ; @@ -1574,39 +1540,22 @@ define @cttz_nxv4i64( %va) { ; RV64F-NEXT: fsrm a0 ; RV64F-NEXT: ret ; -; RV32D-LABEL: cttz_nxv4i64: -; RV32D: # %bb.0: -; RV32D-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32D-NEXT: vmseq.vx v0, v8, zero -; RV32D-NEXT: vrsub.vi v12, v8, 0 -; RV32D-NEXT: vand.vv v8, v8, v12 -; RV32D-NEXT: fsrmi a0, 1 -; RV32D-NEXT: vfcvt.f.xu.v v8, v8 -; RV32D-NEXT: li a1, 52 -; RV32D-NEXT: vsrl.vx v8, v8, a1 -; RV32D-NEXT: li a1, 1023 -; RV32D-NEXT: vsub.vx v8, v8, a1 -; RV32D-NEXT: li a1, 64 -; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 -; RV32D-NEXT: fsrm a0 -; RV32D-NEXT: ret -; -; RV64D-LABEL: cttz_nxv4i64: -; RV64D: # %bb.0: -; RV64D-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64D-NEXT: vrsub.vi v12, v8, 0 -; RV64D-NEXT: vand.vv v12, v8, v12 -; RV64D-NEXT: fsrmi a0, 1 -; RV64D-NEXT: vfcvt.f.xu.v v12, v12 -; RV64D-NEXT: li a1, 52 -; RV64D-NEXT: vsrl.vx v12, v12, a1 -; RV64D-NEXT: li a1, 1023 -; RV64D-NEXT: vsub.vx v12, v12, a1 -; RV64D-NEXT: vmseq.vi v0, v8, 0 -; RV64D-NEXT: li a1, 64 -; RV64D-NEXT: vmerge.vxm v8, v12, a1, v0 -; RV64D-NEXT: fsrm a0 -; RV64D-NEXT: ret +; CHECK-D-LABEL: cttz_nxv4i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-D-NEXT: vrsub.vi v12, v8, 0 +; CHECK-D-NEXT: vand.vv v12, v8, v12 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v12, v12 +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v12, v12, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v12, v12, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv4i64: ; CHECK-ZVBB: # %bb.0: @@ -1702,19 +1651,19 @@ define @cttz_nxv8i64( %va) { ; RV32F-LABEL: cttz_nxv8i64: ; RV32F: # %bb.0: ; RV32F-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32F-NEXT: vmseq.vx v0, v8, zero ; RV32F-NEXT: vrsub.vi v16, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v16 +; RV32F-NEXT: vand.vv v16, v8, v16 ; RV32F-NEXT: fsrmi a0, 1 ; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v16, v8 -; RV32F-NEXT: vsrl.vi v8, v16, 23 +; RV32F-NEXT: vfncvt.f.xu.w v24, v16 +; RV32F-NEXT: vsrl.vi v16, v24, 23 ; RV32F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32F-NEXT: vzext.vf2 v16, v8 +; RV32F-NEXT: vzext.vf2 v24, v16 ; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v16, a1 +; RV32F-NEXT: vsub.vx v16, v24, a1 +; RV32F-NEXT: vmseq.vi v0, v8, 0 ; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: vmerge.vxm v8, v16, a1, v0 ; RV32F-NEXT: fsrm a0 ; RV32F-NEXT: ret ; @@ -1736,39 +1685,22 @@ define @cttz_nxv8i64( %va) { ; RV64F-NEXT: fsrm a0 ; RV64F-NEXT: ret ; -; RV32D-LABEL: cttz_nxv8i64: -; RV32D: # %bb.0: -; RV32D-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; 
RV32D-NEXT: vmseq.vx v0, v8, zero -; RV32D-NEXT: vrsub.vi v16, v8, 0 -; RV32D-NEXT: vand.vv v8, v8, v16 -; RV32D-NEXT: fsrmi a0, 1 -; RV32D-NEXT: vfcvt.f.xu.v v8, v8 -; RV32D-NEXT: li a1, 52 -; RV32D-NEXT: vsrl.vx v8, v8, a1 -; RV32D-NEXT: li a1, 1023 -; RV32D-NEXT: vsub.vx v8, v8, a1 -; RV32D-NEXT: li a1, 64 -; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 -; RV32D-NEXT: fsrm a0 -; RV32D-NEXT: ret -; -; RV64D-LABEL: cttz_nxv8i64: -; RV64D: # %bb.0: -; RV64D-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64D-NEXT: vrsub.vi v16, v8, 0 -; RV64D-NEXT: vand.vv v16, v8, v16 -; RV64D-NEXT: fsrmi a0, 1 -; RV64D-NEXT: vfcvt.f.xu.v v16, v16 -; RV64D-NEXT: li a1, 52 -; RV64D-NEXT: vsrl.vx v16, v16, a1 -; RV64D-NEXT: li a1, 1023 -; RV64D-NEXT: vsub.vx v16, v16, a1 -; RV64D-NEXT: vmseq.vi v0, v8, 0 -; RV64D-NEXT: li a1, 64 -; RV64D-NEXT: vmerge.vxm v8, v16, a1, v0 -; RV64D-NEXT: fsrm a0 -; RV64D-NEXT: ret +; CHECK-D-LABEL: cttz_nxv8i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-D-NEXT: vrsub.vi v16, v8, 0 +; CHECK-D-NEXT: vand.vv v16, v8, v16 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16 +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v16, v16, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v16, v16, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv8i64: ; CHECK-ZVBB: # %bb.0: @@ -3343,4 +3275,6 @@ define @cttz_zero_undef_nxv8i64( %va) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32: {{.*}} +; RV32D: {{.*}} ; RV64: {{.*}} +; RV64D: {{.*}} From f0ac8903ea1c1d664d1fae16ed00e096c713aaee Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 17 Jul 2024 07:07:33 +0200 Subject: [PATCH 217/777] [RISCV][NFC] Fix intrinsic misspelled in a comment (#98998) --- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index 0e84eda0c9d07..0a66a38f6d5ab 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -120,14 +120,16 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) { // // loop: // %phi = phi [ ..., %entry ], [ %acc, %loop ] -// %acc = call float @llvm.vector.reduce.fadd.nxv4f32(float %phi, %vec) +// %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %phi, +// %vec) // // -> // // loop: // %phi = phi [ ..., %entry ], [ %acc.vec, %loop ] // %phi.scalar = extractelement %phi, i64 0 -// %acc = call float @llvm.vector.reduce.fadd.nxv4f32(float %x, %vec) +// %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %x, +// %vec) // %acc.vec = insertelement poison, float %acc.next, i64 0 // // Which eliminates the scalar -> vector -> scalar crossing during instruction From 3fe50b6dde174c76b3380927d7dd43ac19527d64 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 16 Jul 2024 22:14:43 -0700 Subject: [PATCH 218/777] [BOLT] Store FileSymRefs in a multimap With aggressive ICF, it's possible to have different local symbols (under different FILE symbols) to be mapped to the same address. FileSymRefs only keeps a single SymbolRef per address, which prevents fragment matching from finding the correct symbol to perform parent function lookup. Work around this issue by switching FileSymRefs to a multimap. 
In future, uses of FileSymRefs can be replaced with SortedSymbols which keeps essentially the same information. Test Plan: added ambiguous_fragment.test Reviewers: dcci, ayermolo, maksfb, rafaelauler Reviewed By: rafaelauler Pull Request: https://github.com/llvm/llvm-project/pull/98992 --- bolt/include/bolt/Rewrite/RewriteInstance.h | 2 +- bolt/lib/Rewrite/RewriteInstance.cpp | 13 +++-- bolt/test/X86/Inputs/ambiguous_fragment.s | 54 +++++++++++++++++++ .../test/X86/Inputs/ambiguous_fragment.script | 6 +++ bolt/test/X86/ambiguous_fragment.test | 33 ++++++++++++ 5 files changed, 104 insertions(+), 4 deletions(-) create mode 100644 bolt/test/X86/Inputs/ambiguous_fragment.s create mode 100644 bolt/test/X86/Inputs/ambiguous_fragment.script create mode 100644 bolt/test/X86/ambiguous_fragment.test diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index af1d9b4b70a3d..16a82d5687de9 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -490,7 +490,7 @@ class RewriteInstance { std::unordered_map SymbolIndex; /// Store all non-zero symbols in this map for a quick address lookup. - std::map FileSymRefs; + std::multimap FileSymRefs; /// FILE symbols used for disambiguating split function parents. std::vector FileSymbols; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index ded2f577237fe..32562ccb6b345 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -886,7 +886,7 @@ void RewriteInstance::discoverFileObjects() { if (SymName == "__hot_start" || SymName == "__hot_end") continue; - FileSymRefs[SymbolAddress] = Symbol; + FileSymRefs.emplace(SymbolAddress, Symbol); // Skip section symbols that will be registered by disassemblePLT(). if (SymbolType == SymbolRef::ST_Debug) { @@ -1052,7 +1052,9 @@ void RewriteInstance::discoverFileObjects() { // Remove the symbol from FileSymRefs so that we can skip it from // in the future. 
- auto SI = FileSymRefs.find(SymbolAddress); + auto SI = llvm::find_if( + llvm::make_range(FileSymRefs.equal_range(SymbolAddress)), + [&](auto SymIt) { return SymIt.second == Symbol; }); assert(SI != FileSymRefs.end() && "symbol expected to be present"); assert(SI->second == Symbol && "wrong symbol found"); FileSymRefs.erase(SI); @@ -1260,6 +1262,7 @@ void RewriteInstance::discoverFileObjects() { registerFragments(); FileSymbols.clear(); + FileSymRefs.clear(); discoverBOLTReserved(); } @@ -1433,7 +1436,11 @@ void RewriteInstance::registerFragments() { const uint64_t Address = BF->getAddress(); // Get fragment's own symbol - const auto SymIt = FileSymRefs.find(Address); + const auto SymIt = llvm::find_if( + llvm::make_range(FileSymRefs.equal_range(Address)), [&](auto SI) { + StringRef Name = cantFail(SI.second.getName()); + return Name.contains(ParentName); + }); if (SymIt == FileSymRefs.end()) { BC->errs() << "BOLT-ERROR: symbol lookup failed for function at address 0x" diff --git a/bolt/test/X86/Inputs/ambiguous_fragment.s b/bolt/test/X86/Inputs/ambiguous_fragment.s new file mode 100644 index 0000000000000..05346ffd7c344 --- /dev/null +++ b/bolt/test/X86/Inputs/ambiguous_fragment.s @@ -0,0 +1,54 @@ +#--- file1 +.file "file1.cpp" +.section .text.cold +.type __func.cold.0, @function +__func.cold.0: + ud2 + .size __func.cold.0, .-__func.cold.0 +.section .text +.type __func, @function +__func: + ud2 + .size __func, .-__func + +#--- file2 +.file "file2.cpp" +.section .text.cold +.type __func.cold.0, @function +__func.cold.0: + ud2 + .size __func.cold.0, .-__func.cold.0 +.section .text +.type __func, @function +__func: + ud2 + .size __func, .-__func + +#--- file3 +.file "file3.cpp" +.section .text.cold +.type __func.cold.0, @function +__func.cold.0: + ud2 + .size __func.cold.0, .-__func.cold.0 +.section .text +.type __func, @function +__func: + ud2 + .size __func, .-__func + +#--- file4 +.file "file4.cpp" +.section .text.cold +.type __func.cold.0, @function +__func.cold.0: + ud2 + .size __func.cold.0, .-__func.cold.0 +.section .text +.type __func, @function +__func: + ud2 + .size __func, .-__func + +#--- file5 +.file "bolt-pseudo.o" diff --git a/bolt/test/X86/Inputs/ambiguous_fragment.script b/bolt/test/X86/Inputs/ambiguous_fragment.script new file mode 100644 index 0000000000000..00129b8887641 --- /dev/null +++ b/bolt/test/X86/Inputs/ambiguous_fragment.script @@ -0,0 +1,6 @@ +SECTIONS { + . = 0x10000; + .text : { *(.text) } + . = 0x20000; + .text.cold : { *(.text.cold) } +} diff --git a/bolt/test/X86/ambiguous_fragment.test b/bolt/test/X86/ambiguous_fragment.test new file mode 100644 index 0000000000000..e7d32c0a680a3 --- /dev/null +++ b/bolt/test/X86/ambiguous_fragment.test @@ -0,0 +1,33 @@ +## This reproduces a bug with misidentification of a parent fragment. 
+ +RUN: split-file %p/Inputs/ambiguous_fragment.s %t + +RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %t/file1 -o %t1.o +RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %t/file2 -o %t2.o +RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %t/file3 -o %t3.o +RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %t/file4 -o %t4.o +RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %t/file5 -o %t5.o + +RUN: ld.lld %t1.o %t2.o %t3.o %t4.o %t5.o -o %t.exe \ +RUN: --script %p/Inputs/ambiguous_fragment.script + +RUN: llvm-objcopy %t.exe %t.exe2 \ +RUN: --add-symbol=_Zfunc.cold.0=.text.cold:0x4,local,function \ +RUN: --add-symbol=_Zfunc=.text:0xc,function + +RUN: llvm-objdump --syms %t.exe2 | FileCheck %s --check-prefix=CHECK-SYMS + +RUN: link_fdata %s %t.exe2 %t.preagg PREAGG +RUN: perf2bolt -v=1 %t.exe2 -p %t.preagg --pa -o %t.fdata -w %t.yaml | FileCheck %s + +# PREAGG: B X:0 #__func# 1 0 + +CHECK-SYMS: 0000000000020004 {{.*}} __func.cold.0 +CHECK-SYMS: 0000000000020004 {{.*}} _Zfunc.cold.0 + +CHECK-NOT: BOLT-ERROR: parent function not found for __func.cold.0 +CHECK: BOLT-INFO: marking __func.cold.0/3(*4) as a fragment of __func/4(*3) +CHECK-NEXT: BOLT-INFO: marking __func.cold.0/1(*2) as a fragment of __func/1(*2) +CHECK-NEXT: BOLT-INFO: marking __func.cold.0/2(*2) as a fragment of __func/2(*2) +CHECK-NEXT: BOLT-INFO: marking __func.cold.0/3(*4) as a fragment of __func/3(*2) +CHECK-NEXT: BOLT-INFO: marking __func.cold.0/4(*2) as a fragment of __func/4(*3) From 20861f1f2fdc5d9aaa76140ade96f7edfdefb0e1 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Wed, 17 Jul 2024 07:25:11 +0200 Subject: [PATCH 219/777] [mlir][gpu] Use alloc OP's `host_shared` in cuda runtime (#99035) --- .../ExecutionEngine/CudaRuntimeWrappers.cpp | 13 ++++++--- .../GPU/CUDA/alloc-host-shared.mlir | 27 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 09dc30365e37c..6a32309aa9e05 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -237,11 +237,18 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * -mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, bool /*isHostShared*/) { +mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) { ScopedContext scopedContext; CUdeviceptr ptr = 0; - if (sizeBytes != 0) - CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); + if (sizeBytes == 0) + return reinterpret_cast(ptr); + + if (isHostShared) { + CUDA_REPORT_IF_ERROR( + cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL)); + return reinterpret_cast(ptr); + } + CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); return reinterpret_cast(ptr); } diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir new file mode 100644 index 0000000000000..77fa0deffdd69 --- /dev/null +++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir @@ -0,0 +1,27 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-cpu-runner \ +// RUN: --shared-libs=%mlir_cuda_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +// CHECK: 2000 +module attributes 
{gpu.container_module} { + func.func @main() { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1000_i32 = arith.constant 1000 : i32 + %memref = gpu.alloc host_shared () : memref<1xi32> + memref.store %c1000_i32, %memref[%c1] : memref<1xi32> + gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) { + %1 = memref.load %memref[%c1] : memref<1xi32> + %2 = arith.addi %1, %1 : i32 + memref.store %2, %memref[%c1] : memref<1xi32> + gpu.terminator + } + %0 = memref.load %memref[%c1] : memref<1xi32> + vector.print %0 : i32 + return + } +} From e316f1956992730fa601849799ccb12d17f507d7 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Tue, 16 Jul 2024 16:56:12 +0800 Subject: [PATCH 220/777] [LoongArch] Pre-commit tests for spurious mask removal. NFC --- llvm/test/CodeGen/LoongArch/andn-icmp.ll | 452 +++++++++++++++++++++++ 1 file changed, 452 insertions(+) diff --git a/llvm/test/CodeGen/LoongArch/andn-icmp.ll b/llvm/test/CodeGen/LoongArch/andn-icmp.ll index c529c2e281214..4fc3c8df4664c 100644 --- a/llvm/test/CodeGen/LoongArch/andn-icmp.ll +++ b/llvm/test/CodeGen/LoongArch/andn-icmp.ll @@ -149,3 +149,455 @@ define i1 @andn_icmp_ne_i64(i64 %a, i64 %b) nounwind { %cmpne = icmp ne i64 %and, %b ret i1 %cmpne } + +define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i16: +; LA32: # %bb.0: +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i16: +; LA64: # %bb.0: +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i16 %a, %b + %cmp = icmp ult i16 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_uge_i8: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: xori $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_uge_i8: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: xori $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp uge i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind { +; LA32-LABEL: andn_icmp_uge_i16: +; LA32: # %bb.0: +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: xori $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_uge_i16: +; LA64: # %bb.0: +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: xori $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i16 %a, %b + %cmp = icmp uge i16 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ugt_i8: +; LA32: # %bb.0: +; 
LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ugt_i8: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ugt i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ugt_i16: +; LA32: # %bb.0: +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ugt_i16: +; LA64: # %bb.0: +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ret + %and = and i16 %a, %b + %cmp = icmp ugt i16 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ule_i8: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: xori $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ule_i8: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: xori $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ule i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ule_i16(i16 signext %a, i16 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ule_i16: +; LA32: # %bb.0: +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: xori $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ule_i16: +; LA64: # %bb.0: +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: xori $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i16 %a, %b + %cmp = icmp ule i16 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_sz(i8 signext %a, i8 zeroext %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_sz: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_sz: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_zs(i8 zeroext %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_zs: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_zs: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_zz(i8 zeroext %a, i8 zeroext %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_zz: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_zz: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_sn(i8 signext %a, i8 %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_sn: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_sn: 
+; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_zn(i8 zeroext %a, i8 %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_zn: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_zn: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_ns(i8 %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_ns: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_ns: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_nz(i8 %a, i8 zeroext %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_nz: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_nz: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_eq_i8_nn(i8 %a, i8 %b) nounwind { +; LA32-LABEL: andn_icmp_eq_i8_nn: +; LA32: # %bb.0: +; LA32-NEXT: andn $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_eq_i8_nn: +; LA64: # %bb.0: +; LA64-NEXT: andn $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp eq i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_sz(i8 signext %a, i8 zeroext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_sz: +; LA32: # %bb.0: +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_sz: +; LA64: # %bb.0: +; LA64-NEXT: and $a0, $a0, $a1 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_zs(i8 zeroext %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_zs: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_zs: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a0, $a1 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_zz(i8 zeroext %a, i8 zeroext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_zz: +; LA32: # %bb.0: +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_zz: +; LA64: # %bb.0: +; LA64-NEXT: and $a0, $a0, $a1 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_sn(i8 signext %a, i8 %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_sn: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; 
LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_sn: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_zn(i8 zeroext %a, i8 %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_zn: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_zn: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_ns(i8 %a, i8 signext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_ns: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_ns: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_nz(i8 %a, i8 zeroext %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_nz: +; LA32: # %bb.0: +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_nz: +; LA64: # %bb.0: +; LA64-NEXT: and $a0, $a0, $a1 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} + +define i1 @andn_icmp_ult_i8_nn(i8 %a, i8 %b) nounwind { +; LA32-LABEL: andn_icmp_ult_i8_nn: +; LA32: # %bb.0: +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: sltu $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: andn_icmp_ult_i8_nn: +; LA64: # %bb.0: +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: sltu $a0, $a0, $a1 +; LA64-NEXT: ret + %and = and i8 %a, %b + %cmp = icmp ult i8 %and, %b + ret i1 %cmp +} From b330d800cb7917e537b05a23febfe188401c5628 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 17 Jul 2024 07:52:40 +0200 Subject: [PATCH 221/777] Reapply [Clang][C++26] Implement "Ordering of constraints involving fold expressions (#99022) Implement https://isocpp.org/files/papers/P2963R3.pdf --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Sema/Sema.h | 5 + clang/include/clang/Sema/SemaConcept.h | 193 ++++++-- clang/lib/Sema/SemaConcept.cpp | 612 ++++++++++++++++-------- clang/lib/Sema/SemaTemplateVariadic.cpp | 4 + clang/test/SemaCXX/cxx2c-fold-exprs.cpp | 277 +++++++++++ clang/www/cxx_status.html | 2 +- 7 files changed, 862 insertions(+), 234 deletions(-) create mode 100644 clang/test/SemaCXX/cxx2c-fold-exprs.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d0138d6b00017..8c0d1635d2756 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -276,6 +276,9 @@ C++2c Feature Support - Implemented `P3144R2 Deleting a Pointer to an Incomplete Type Should be Ill-formed `_. +- Implemented `P2963R3 Ordering of constraints involving fold expressions `_. 
+ + Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Substitute template parameter pack, when it is not explicitly specified diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 48dff1b76cc57..3cb1aa935fe46 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -14078,6 +14078,11 @@ class Sema final : public SemaBase { const DeclarationNameInfo &NameInfo, SmallVectorImpl &Unexpanded); + /// Collect the set of unexpanded parameter packs within the given + /// expression. + static void collectUnexpandedParameterPacks( + Expr *E, SmallVectorImpl &Unexpanded); + /// Invoked when parsing a template argument followed by an /// ellipsis, which creates a pack expansion. /// diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h index 711443505174f..03791962b2dc0 100644 --- a/clang/include/clang/Sema/SemaConcept.h +++ b/clang/include/clang/Sema/SemaConcept.h @@ -26,7 +26,9 @@ namespace clang { class Sema; -struct AtomicConstraint { +enum { ConstraintAlignment = 8 }; + +struct alignas(ConstraintAlignment) AtomicConstraint { const Expr *ConstraintExpr; std::optional> ParameterMapping; @@ -75,6 +77,28 @@ struct AtomicConstraint { } }; +struct alignas(ConstraintAlignment) FoldExpandedConstraint; + +using NormalFormConstraint = + llvm::PointerUnion; +struct NormalizedConstraint; +using NormalForm = + llvm::SmallVector, 4>; + +// A constraint is in conjunctive normal form when it is a conjunction of +// clauses where each clause is a disjunction of atomic constraints. For atomic +// constraints A, B, and C, the constraint A  ∧ (B  ∨ C) is in conjunctive +// normal form. +NormalForm makeCNF(const NormalizedConstraint &Normalized); + +// A constraint is in disjunctive normal form when it is a disjunction of +// clauses where each clause is a conjunction of atomic constraints. For atomic +// constraints A, B, and C, the disjunctive normal form of the constraint A +//  ∧ (B  ∨ C) is (A  ∧ B)  ∨ (A  ∧ C). +NormalForm makeDNF(const NormalizedConstraint &Normalized); + +struct alignas(ConstraintAlignment) NormalizedConstraintPair; + /// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is /// either an atomic constraint, a conjunction of normalized constraints or a /// disjunction of normalized constraints. 
@@ -83,30 +107,20 @@ struct NormalizedConstraint { enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; - using CompoundConstraint = llvm::PointerIntPair< - std::pair *, 1, - CompoundConstraintKind>; + using CompoundConstraint = llvm::PointerIntPair; - llvm::PointerUnion Constraint; + llvm::PointerUnion + Constraint; NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; + NormalizedConstraint(FoldExpandedConstraint *C) : Constraint{C} {}; + NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, - NormalizedConstraint RHS, CompoundConstraintKind Kind) - : Constraint{CompoundConstraint{ - new (C) std::pair{ - std::move(LHS), std::move(RHS)}, Kind}} { }; - - NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other) { - if (Other.isAtomic()) { - Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint()); - } else { - Constraint = CompoundConstraint( - new (C) std::pair{ - NormalizedConstraint(C, Other.getLHS()), - NormalizedConstraint(C, Other.getRHS())}, - Other.getCompoundKind()); - } - } + NormalizedConstraint RHS, CompoundConstraintKind Kind); + + NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other); NormalizedConstraint(NormalizedConstraint &&Other): Constraint(Other.Constraint) { Other.Constraint = nullptr; @@ -120,29 +134,32 @@ struct NormalizedConstraint { return *this; } - CompoundConstraintKind getCompoundKind() const { - assert(!isAtomic() && "getCompoundKind called on atomic constraint."); - return Constraint.get().getInt(); - } - bool isAtomic() const { return Constraint.is(); } - - NormalizedConstraint &getLHS() const { - assert(!isAtomic() && "getLHS called on atomic constraint."); - return Constraint.get().getPointer()->first; + bool isFoldExpanded() const { + return Constraint.is(); } + bool isCompound() const { return Constraint.is(); } - NormalizedConstraint &getRHS() const { - assert(!isAtomic() && "getRHS called on atomic constraint."); - return Constraint.get().getPointer()->second; + CompoundConstraintKind getCompoundKind() const { + assert(isCompound() && "getCompoundKind on a non-compound constraint.."); + return Constraint.get().getInt(); } + NormalizedConstraint &getLHS() const; + NormalizedConstraint &getRHS() const; + AtomicConstraint *getAtomicConstraint() const { assert(isAtomic() && "getAtomicConstraint called on non-atomic constraint."); return Constraint.get(); } + FoldExpandedConstraint *getFoldExpandedConstraint() const { + assert(isFoldExpanded() && + "getFoldExpandedConstraint called on non-fold-expanded constraint."); + return Constraint.get(); + } + private: static std::optional fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E); @@ -150,6 +167,116 @@ struct NormalizedConstraint { fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E); }; +struct alignas(ConstraintAlignment) NormalizedConstraintPair { + NormalizedConstraint LHS, RHS; +}; + +struct alignas(ConstraintAlignment) FoldExpandedConstraint { + enum class FoldOperatorKind { And, Or } Kind; + NormalizedConstraint Constraint; + const Expr *Pattern; + + FoldExpandedConstraint(FoldOperatorKind K, NormalizedConstraint C, + const Expr *Pattern) + : Kind(K), Constraint(std::move(C)), Pattern(Pattern) {}; + + template + bool subsumes(const FoldExpandedConstraint &Other, + const AtomicSubsumptionEvaluator &E) const; + + static bool AreCompatibleForSubsumption(const FoldExpandedConstraint &A, + const FoldExpandedConstraint &B); +}; + +const NormalizedConstraint *getNormalizedAssociatedConstraints( + Sema &S, NamedDecl 
*ConstrainedDecl, + ArrayRef AssociatedConstraints); + +template +bool subsumes(const NormalForm &PDNF, const NormalForm &QCNF, + const AtomicSubsumptionEvaluator &E) { + // C++ [temp.constr.order] p2 + // Then, P subsumes Q if and only if, for every disjunctive clause Pi in the + // disjunctive normal form of P, Pi subsumes every conjunctive clause Qj in + // the conjuctive normal form of Q, where [...] + for (const auto &Pi : PDNF) { + for (const auto &Qj : QCNF) { + // C++ [temp.constr.order] p2 + // - [...] a disjunctive clause Pi subsumes a conjunctive clause Qj if + // and only if there exists an atomic constraint Pia in Pi for which + // there exists an atomic constraint, Qjb, in Qj such that Pia + // subsumes Qjb. + bool Found = false; + for (NormalFormConstraint Pia : Pi) { + for (NormalFormConstraint Qjb : Qj) { + if (Pia.is() && + Qjb.is()) { + if (Pia.get()->subsumes( + *Qjb.get(), E)) { + Found = true; + break; + } + } else if (Pia.is() && + Qjb.is()) { + if (E(*Pia.get(), + *Qjb.get())) { + Found = true; + break; + } + } + } + if (Found) + break; + } + if (!Found) + return false; + } + } + return true; +} + +template +bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, NamedDecl *DQ, + ArrayRef Q, bool &Subsumes, + const AtomicSubsumptionEvaluator &E) { + // C++ [temp.constr.order] p2 + // In order to determine if a constraint P subsumes a constraint Q, P is + // transformed into disjunctive normal form, and Q is transformed into + // conjunctive normal form. [...] + const NormalizedConstraint *PNormalized = + getNormalizedAssociatedConstraints(S, DP, P); + if (!PNormalized) + return true; + NormalForm PDNF = makeDNF(*PNormalized); + + const NormalizedConstraint *QNormalized = + getNormalizedAssociatedConstraints(S, DQ, Q); + if (!QNormalized) + return true; + NormalForm QCNF = makeCNF(*QNormalized); + + Subsumes = subsumes(PDNF, QCNF, E); + return false; +} + +template +bool FoldExpandedConstraint::subsumes( + const FoldExpandedConstraint &Other, + const AtomicSubsumptionEvaluator &E) const { + + // [C++26] [temp.constr.order] + // a fold expanded constraint A subsumes another fold expanded constraint B if + // they are compatible for subsumption, have the same fold-operator, and the + // constraint of A subsumes that of B + + if (Kind != Other.Kind || !AreCompatibleForSubsumption(*this, Other)) + return false; + + NormalForm PDNF = makeDNF(this->Constraint); + NormalForm QCNF = makeCNF(Other.Constraint); + return clang::subsumes(PDNF, QCNF, E); +} + } // clang #endif // LLVM_CLANG_SEMA_SEMACONCEPT_H diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 54891150da20f..84c5753a46ac3 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -65,6 +65,7 @@ class LogicalBinOp { const Expr *getLHS() const { return LHS; } const Expr *getRHS() const { return RHS; } + OverloadedOperatorKind getOp() const { return Op; } ExprResult recreateBinOp(Sema &SemaRef, ExprResult LHS) const { return recreateBinOp(SemaRef, LHS, const_cast(getRHS())); @@ -177,77 +178,177 @@ struct SatisfactionStackRAII { }; } // namespace -template +template static ExprResult calculateConstraintSatisfaction(Sema &S, const Expr *ConstraintExpr, ConstraintSatisfaction &Satisfaction, - AtomicEvaluator &&Evaluator) { - ConstraintExpr = ConstraintExpr->IgnoreParenImpCasts(); + const ConstraintEvaluator &Evaluator); - if (LogicalBinOp BO = ConstraintExpr) { - size_t EffectiveDetailEndIndex = Satisfaction.Details.size(); - ExprResult LHSRes = 
calculateConstraintSatisfaction( - S, BO.getLHS(), Satisfaction, Evaluator); +template +static ExprResult +calculateConstraintSatisfaction(Sema &S, const Expr *LHS, + OverloadedOperatorKind Op, const Expr *RHS, + ConstraintSatisfaction &Satisfaction, + const ConstraintEvaluator &Evaluator) { + size_t EffectiveDetailEndIndex = Satisfaction.Details.size(); - if (LHSRes.isInvalid()) - return ExprError(); + ExprResult LHSRes = + calculateConstraintSatisfaction(S, LHS, Satisfaction, Evaluator); - bool IsLHSSatisfied = Satisfaction.IsSatisfied; + if (LHSRes.isInvalid()) + return ExprError(); - if (BO.isOr() && IsLHSSatisfied) - // [temp.constr.op] p3 - // A disjunction is a constraint taking two operands. To determine if - // a disjunction is satisfied, the satisfaction of the first operand - // is checked. If that is satisfied, the disjunction is satisfied. - // Otherwise, the disjunction is satisfied if and only if the second - // operand is satisfied. - // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp. - return LHSRes; + bool IsLHSSatisfied = Satisfaction.IsSatisfied; + + if (Op == clang::OO_PipePipe && IsLHSSatisfied) + // [temp.constr.op] p3 + // A disjunction is a constraint taking two operands. To determine if + // a disjunction is satisfied, the satisfaction of the first operand + // is checked. If that is satisfied, the disjunction is satisfied. + // Otherwise, the disjunction is satisfied if and only if the second + // operand is satisfied. + // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp. + return LHSRes; + + if (Op == clang::OO_AmpAmp && !IsLHSSatisfied) + // [temp.constr.op] p2 + // A conjunction is a constraint taking two operands. To determine if + // a conjunction is satisfied, the satisfaction of the first operand + // is checked. If that is not satisfied, the conjunction is not + // satisfied. Otherwise, the conjunction is satisfied if and only if + // the second operand is satisfied. + // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp. + return LHSRes; + + ExprResult RHSRes = + calculateConstraintSatisfaction(S, RHS, Satisfaction, Evaluator); + if (RHSRes.isInvalid()) + return ExprError(); - if (BO.isAnd() && !IsLHSSatisfied) - // [temp.constr.op] p2 - // A conjunction is a constraint taking two operands. To determine if - // a conjunction is satisfied, the satisfaction of the first operand - // is checked. If that is not satisfied, the conjunction is not - // satisfied. Otherwise, the conjunction is satisfied if and only if - // the second operand is satisfied. - // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp. - return LHSRes; - - ExprResult RHSRes = calculateConstraintSatisfaction( - S, BO.getRHS(), Satisfaction, std::forward(Evaluator)); - if (RHSRes.isInvalid()) + bool IsRHSSatisfied = Satisfaction.IsSatisfied; + // Current implementation adds diagnostic information about the falsity + // of each false atomic constraint expression when it evaluates them. + // When the evaluation results to `false || true`, the information + // generated during the evaluation of left-hand side is meaningless + // because the whole expression evaluates to true. + // The following code removes the irrelevant diagnostic information. + // FIXME: We should probably delay the addition of diagnostic information + // until we know the entire expression is false. 
+ if (Op == clang::OO_PipePipe && IsRHSSatisfied) { + auto EffectiveDetailEnd = Satisfaction.Details.begin(); + std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex); + Satisfaction.Details.erase(EffectiveDetailEnd, Satisfaction.Details.end()); + } + + if (!LHSRes.isUsable() || !RHSRes.isUsable()) + return ExprEmpty(); + + return BinaryOperator::Create(S.Context, LHSRes.get(), RHSRes.get(), + BinaryOperator::getOverloadedOpcode(Op), + S.Context.BoolTy, VK_PRValue, OK_Ordinary, + LHS->getBeginLoc(), FPOptionsOverride{}); +} + +template +static ExprResult +calculateConstraintSatisfaction(Sema &S, const CXXFoldExpr *FE, + ConstraintSatisfaction &Satisfaction, + const ConstraintEvaluator &Evaluator) { + bool Conjunction = FE->getOperator() == BinaryOperatorKind::BO_LAnd; + size_t EffectiveDetailEndIndex = Satisfaction.Details.size(); + + ExprResult Out; + if (FE->isLeftFold() && FE->getInit()) { + Out = calculateConstraintSatisfaction(S, FE->getInit(), Satisfaction, + Evaluator); + if (Out.isInvalid()) return ExprError(); + // If the first clause of a conjunction is not satisfied, + // or if the first clause of a disjection is satisfied, + // we have established satisfaction of the whole constraint + // and we should not continue further. + if (Conjunction != Satisfaction.IsSatisfied) + return Out; + } + std::optional NumExpansions = + Evaluator.EvaluateFoldExpandedConstraintSize(FE); + if (!NumExpansions) + return ExprError(); + for (unsigned I = 0; I < *NumExpansions; I++) { + Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(S, I); + ExprResult Res = calculateConstraintSatisfaction(S, FE->getPattern(), + Satisfaction, Evaluator); + if (Res.isInvalid()) + return ExprError(); bool IsRHSSatisfied = Satisfaction.IsSatisfied; - // Current implementation adds diagnostic information about the falsity - // of each false atomic constraint expression when it evaluates them. - // When the evaluation results to `false || true`, the information - // generated during the evaluation of left-hand side is meaningless - // because the whole expression evaluates to true. - // The following code removes the irrelevant diagnostic information. - // FIXME: We should probably delay the addition of diagnostic information - // until we know the entire expression is false. 
- if (BO.isOr() && IsRHSSatisfied) { + if (!Conjunction && IsRHSSatisfied) { auto EffectiveDetailEnd = Satisfaction.Details.begin(); std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex); Satisfaction.Details.erase(EffectiveDetailEnd, Satisfaction.Details.end()); } + if (Out.isUnset()) + Out = Res; + else if (!Res.isUnset()) { + Out = BinaryOperator::Create( + S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy, + VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{}); + } + if (Conjunction != IsRHSSatisfied) + return Out; + } - return BO.recreateBinOp(S, LHSRes, RHSRes); + if (FE->isRightFold() && FE->getInit()) { + ExprResult Res = calculateConstraintSatisfaction(S, FE->getInit(), + Satisfaction, Evaluator); + if (Out.isInvalid()) + return ExprError(); + + if (Out.isUnset()) + Out = Res; + else if (!Res.isUnset()) { + Out = BinaryOperator::Create( + S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy, + VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{}); + } } + if (Out.isUnset()) { + Satisfaction.IsSatisfied = Conjunction; + Out = S.BuildEmptyCXXFoldExpr(FE->getBeginLoc(), FE->getOperator()); + } + return Out; +} + +template +static ExprResult +calculateConstraintSatisfaction(Sema &S, const Expr *ConstraintExpr, + ConstraintSatisfaction &Satisfaction, + const ConstraintEvaluator &Evaluator) { + ConstraintExpr = ConstraintExpr->IgnoreParenImpCasts(); + + if (LogicalBinOp BO = ConstraintExpr) + return calculateConstraintSatisfaction( + S, BO.getLHS(), BO.getOp(), BO.getRHS(), Satisfaction, Evaluator); + if (auto *C = dyn_cast(ConstraintExpr)) { // These aren't evaluated, so we don't care about cleanups, so we can just // evaluate these as if the cleanups didn't exist. - return calculateConstraintSatisfaction( - S, C->getSubExpr(), Satisfaction, - std::forward(Evaluator)); + return calculateConstraintSatisfaction(S, C->getSubExpr(), Satisfaction, + Evaluator); + } + + if (auto *FE = dyn_cast(ConstraintExpr); + FE && S.getLangOpts().CPlusPlus26 && + (FE->getOperator() == BinaryOperatorKind::BO_LAnd || + FE->getOperator() == BinaryOperatorKind::BO_LOr)) { + return calculateConstraintSatisfaction(S, FE, Satisfaction, Evaluator); } // An atomic constraint expression - ExprResult SubstitutedAtomicExpr = Evaluator(ConstraintExpr); + ExprResult SubstitutedAtomicExpr = + Evaluator.EvaluateAtomicConstraint(ConstraintExpr); if (SubstitutedAtomicExpr.isInvalid()) return ExprError(); @@ -334,91 +435,132 @@ static ExprResult calculateConstraintSatisfaction( Sema &S, const NamedDecl *Template, SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, const Expr *ConstraintExpr, ConstraintSatisfaction &Satisfaction) { - return calculateConstraintSatisfaction( - S, ConstraintExpr, Satisfaction, [&](const Expr *AtomicExpr) { - EnterExpressionEvaluationContext ConstantEvaluated( - S, Sema::ExpressionEvaluationContext::ConstantEvaluated, - Sema::ReuseLambdaContextDecl); - - // Atomic constraint - substitute arguments and check satisfaction. 
- ExprResult SubstitutedExpression; - { - TemplateDeductionInfo Info(TemplateNameLoc); - Sema::InstantiatingTemplate Inst(S, AtomicExpr->getBeginLoc(), - Sema::InstantiatingTemplate::ConstraintSubstitution{}, - const_cast(Template), Info, - AtomicExpr->getSourceRange()); - if (Inst.isInvalid()) + + struct ConstraintEvaluator { + Sema &S; + const NamedDecl *Template; + SourceLocation TemplateNameLoc; + const MultiLevelTemplateArgumentList &MLTAL; + ConstraintSatisfaction &Satisfaction; + + ExprResult EvaluateAtomicConstraint(const Expr *AtomicExpr) const { + EnterExpressionEvaluationContext ConstantEvaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated, + Sema::ReuseLambdaContextDecl); + + // Atomic constraint - substitute arguments and check satisfaction. + ExprResult SubstitutedExpression; + { + TemplateDeductionInfo Info(TemplateNameLoc); + Sema::InstantiatingTemplate Inst( + S, AtomicExpr->getBeginLoc(), + Sema::InstantiatingTemplate::ConstraintSubstitution{}, + const_cast(Template), Info, + AtomicExpr->getSourceRange()); + if (Inst.isInvalid()) + return ExprError(); + + llvm::FoldingSetNodeID ID; + if (Template && + DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, MLTAL)) { + Satisfaction.IsSatisfied = false; + Satisfaction.ContainsErrors = true; + return ExprEmpty(); + } + + SatisfactionStackRAII StackRAII(S, Template, ID); + + // We do not want error diagnostics escaping here. + Sema::SFINAETrap Trap(S); + SubstitutedExpression = + S.SubstConstraintExpr(const_cast(AtomicExpr), MLTAL); + + if (SubstitutedExpression.isInvalid() || Trap.hasErrorOccurred()) { + // C++2a [temp.constr.atomic]p1 + // ...If substitution results in an invalid type or expression, the + // constraint is not satisfied. + if (!Trap.hasErrorOccurred()) + // A non-SFINAE error has occurred as a result of this + // substitution. return ExprError(); - llvm::FoldingSetNodeID ID; - if (Template && - DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, MLTAL)) { - Satisfaction.IsSatisfied = false; - Satisfaction.ContainsErrors = true; - return ExprEmpty(); - } - - SatisfactionStackRAII StackRAII(S, Template, ID); - - // We do not want error diagnostics escaping here. - Sema::SFINAETrap Trap(S); - SubstitutedExpression = - S.SubstConstraintExpr(const_cast(AtomicExpr), MLTAL); - - if (SubstitutedExpression.isInvalid() || Trap.hasErrorOccurred()) { - // C++2a [temp.constr.atomic]p1 - // ...If substitution results in an invalid type or expression, the - // constraint is not satisfied. - if (!Trap.hasErrorOccurred()) - // A non-SFINAE error has occurred as a result of this - // substitution. - return ExprError(); - - PartialDiagnosticAt SubstDiag{SourceLocation(), - PartialDiagnostic::NullDiagnostic()}; - Info.takeSFINAEDiagnostic(SubstDiag); - // FIXME: Concepts: This is an unfortunate consequence of there - // being no serialization code for PartialDiagnostics and the fact - // that serializing them would likely take a lot more storage than - // just storing them as strings. We would still like, in the - // future, to serialize the proper PartialDiagnostic as serializing - // it as a string defeats the purpose of the diagnostic mechanism. 
- SmallString<128> DiagString; - DiagString = ": "; - SubstDiag.second.EmitToString(S.getDiagnostics(), DiagString); - unsigned MessageSize = DiagString.size(); - char *Mem = new (S.Context) char[MessageSize]; - memcpy(Mem, DiagString.c_str(), MessageSize); - Satisfaction.Details.emplace_back( - new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{ - SubstDiag.first, StringRef(Mem, MessageSize)}); - Satisfaction.IsSatisfied = false; - return ExprEmpty(); - } + PartialDiagnosticAt SubstDiag{SourceLocation(), + PartialDiagnostic::NullDiagnostic()}; + Info.takeSFINAEDiagnostic(SubstDiag); + // FIXME: Concepts: This is an unfortunate consequence of there + // being no serialization code for PartialDiagnostics and the fact + // that serializing them would likely take a lot more storage than + // just storing them as strings. We would still like, in the + // future, to serialize the proper PartialDiagnostic as serializing + // it as a string defeats the purpose of the diagnostic mechanism. + SmallString<128> DiagString; + DiagString = ": "; + SubstDiag.second.EmitToString(S.getDiagnostics(), DiagString); + unsigned MessageSize = DiagString.size(); + char *Mem = new (S.Context) char[MessageSize]; + memcpy(Mem, DiagString.c_str(), MessageSize); + Satisfaction.Details.emplace_back( + new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{ + SubstDiag.first, StringRef(Mem, MessageSize)}); + Satisfaction.IsSatisfied = false; + return ExprEmpty(); } + } - if (!S.CheckConstraintExpression(SubstitutedExpression.get())) - return ExprError(); + if (!S.CheckConstraintExpression(SubstitutedExpression.get())) + return ExprError(); + + // [temp.constr.atomic]p3: To determine if an atomic constraint is + // satisfied, the parameter mapping and template arguments are first + // substituted into its expression. If substitution results in an + // invalid type or expression, the constraint is not satisfied. + // Otherwise, the lvalue-to-rvalue conversion is performed if necessary, + // and E shall be a constant expression of type bool. + // + // Perform the L to R Value conversion if necessary. We do so for all + // non-PRValue categories, else we fail to extend the lifetime of + // temporaries, and that fails the constant expression check. + if (!SubstitutedExpression.get()->isPRValue()) + SubstitutedExpression = ImplicitCastExpr::Create( + S.Context, SubstitutedExpression.get()->getType(), + CK_LValueToRValue, SubstitutedExpression.get(), + /*BasePath=*/nullptr, VK_PRValue, FPOptionsOverride()); + + return SubstitutedExpression; + } - // [temp.constr.atomic]p3: To determine if an atomic constraint is - // satisfied, the parameter mapping and template arguments are first - // substituted into its expression. If substitution results in an - // invalid type or expression, the constraint is not satisfied. - // Otherwise, the lvalue-to-rvalue conversion is performed if necessary, - // and E shall be a constant expression of type bool. - // - // Perform the L to R Value conversion if necessary. We do so for all - // non-PRValue categories, else we fail to extend the lifetime of - // temporaries, and that fails the constant expression check. 
- if (!SubstitutedExpression.get()->isPRValue()) - SubstitutedExpression = ImplicitCastExpr::Create( - S.Context, SubstitutedExpression.get()->getType(), - CK_LValueToRValue, SubstitutedExpression.get(), - /*BasePath=*/nullptr, VK_PRValue, FPOptionsOverride()); - - return SubstitutedExpression; - }); + std::optional + EvaluateFoldExpandedConstraintSize(const CXXFoldExpr *FE) const { + Expr *Pattern = FE->getPattern(); + + SmallVector Unexpanded; + S.collectUnexpandedParameterPacks(Pattern, Unexpanded); + assert(!Unexpanded.empty() && "Pack expansion without parameter packs?"); + bool Expand = true; + bool RetainExpansion = false; + std::optional OrigNumExpansions = FE->getNumExpansions(), + NumExpansions = OrigNumExpansions; + if (S.CheckParameterPacksForExpansion( + FE->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, + MLTAL, Expand, RetainExpansion, NumExpansions) || + !Expand || RetainExpansion) + return std::nullopt; + + if (NumExpansions && S.getLangOpts().BracketDepth < NumExpansions) { + S.Diag(FE->getEllipsisLoc(), + clang::diag::err_fold_expression_limit_exceeded) + << *NumExpansions << S.getLangOpts().BracketDepth + << FE->getSourceRange(); + S.Diag(FE->getEllipsisLoc(), diag::note_bracket_depth); + return std::nullopt; + } + return NumExpansions; + } + }; + + return calculateConstraintSatisfaction( + S, ConstraintExpr, Satisfaction, + ConstraintEvaluator{S, Template, TemplateNameLoc, MLTAL, Satisfaction}); } static bool CheckConstraintSatisfaction( @@ -534,13 +676,21 @@ bool Sema::CheckConstraintSatisfaction( bool Sema::CheckConstraintSatisfaction(const Expr *ConstraintExpr, ConstraintSatisfaction &Satisfaction) { - return calculateConstraintSatisfaction( - *this, ConstraintExpr, Satisfaction, - [this](const Expr *AtomicExpr) -> ExprResult { - // We only do this to immitate lvalue-to-rvalue conversion. 
- return PerformContextuallyConvertToBool( - const_cast(AtomicExpr)); - }) + + struct ConstraintEvaluator { + Sema &S; + ExprResult EvaluateAtomicConstraint(const Expr *AtomicExpr) const { + return S.PerformContextuallyConvertToBool(const_cast(AtomicExpr)); + } + + std::optional + EvaluateFoldExpandedConstraintSize(const CXXFoldExpr *FE) const { + return 0; + } + }; + + return calculateConstraintSatisfaction(*this, ConstraintExpr, Satisfaction, + ConstraintEvaluator{*this}) .isInvalid(); } @@ -1235,18 +1385,34 @@ Sema::getNormalizedAssociatedConstraints( return CacheEntry->second; } +const NormalizedConstraint *clang::getNormalizedAssociatedConstraints( + Sema &S, NamedDecl *ConstrainedDecl, + ArrayRef AssociatedConstraints) { + return S.getNormalizedAssociatedConstraints(ConstrainedDecl, + AssociatedConstraints); +} + static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, ConceptDecl *Concept, const MultiLevelTemplateArgumentList &MLTAL, const ASTTemplateArgumentListInfo *ArgsAsWritten) { - if (!N.isAtomic()) { + + if (N.isCompound()) { if (substituteParameterMappings(S, N.getLHS(), Concept, MLTAL, ArgsAsWritten)) return true; return substituteParameterMappings(S, N.getRHS(), Concept, MLTAL, ArgsAsWritten); } + + if (N.isFoldExpanded()) { + Sema::ArgumentPackSubstitutionIndexRAII _(S, -1); + return substituteParameterMappings( + S, N.getFoldExpandedConstraint()->Constraint, Concept, MLTAL, + ArgsAsWritten); + } + TemplateParameterList *TemplateParams = Concept->getTemplateParameters(); AtomicConstraint &Atomic = *N.getAtomicConstraint(); @@ -1313,6 +1479,42 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, CSE->getTemplateArgsAsWritten()); } +NormalizedConstraint::NormalizedConstraint(ASTContext &C, + NormalizedConstraint LHS, + NormalizedConstraint RHS, + CompoundConstraintKind Kind) + : Constraint{CompoundConstraint{ + new(C) NormalizedConstraintPair{std::move(LHS), std::move(RHS)}, + Kind}} {} + +NormalizedConstraint::NormalizedConstraint(ASTContext &C, + const NormalizedConstraint &Other) { + if (Other.isAtomic()) { + Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint()); + } else if (Other.isFoldExpanded()) { + Constraint = new (C) FoldExpandedConstraint( + Other.getFoldExpandedConstraint()->Kind, + NormalizedConstraint(C, Other.getFoldExpandedConstraint()->Constraint), + Other.getFoldExpandedConstraint()->Pattern); + } else { + Constraint = CompoundConstraint( + new (C) + NormalizedConstraintPair{NormalizedConstraint(C, Other.getLHS()), + NormalizedConstraint(C, Other.getRHS())}, + Other.getCompoundKind()); + } +} + +NormalizedConstraint &NormalizedConstraint::getLHS() const { + assert(isCompound() && "getLHS called on a non-compound constraint."); + return Constraint.get().getPointer()->LHS; +} + +NormalizedConstraint &NormalizedConstraint::getRHS() const { + assert(isCompound() && "getRHS called on a non-compound constraint."); + return Constraint.get().getPointer()->RHS; +} + std::optional NormalizedConstraint::fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E) { @@ -1387,17 +1589,75 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { return std::nullopt; return New; + } else if (auto *FE = dyn_cast(E); + FE && S.getLangOpts().CPlusPlus26 && + (FE->getOperator() == BinaryOperatorKind::BO_LAnd || + FE->getOperator() == BinaryOperatorKind::BO_LOr)) { + + // Normalize fold expressions in C++26. 
+ + FoldExpandedConstraint::FoldOperatorKind Kind = + FE->getOperator() == BinaryOperatorKind::BO_LAnd + ? FoldExpandedConstraint::FoldOperatorKind::And + : FoldExpandedConstraint::FoldOperatorKind::Or; + + if (FE->getInit()) { + auto LHS = fromConstraintExpr(S, D, FE->getLHS()); + auto RHS = fromConstraintExpr(S, D, FE->getRHS()); + if (!LHS || !RHS) + return std::nullopt; + + if (FE->isRightFold()) + RHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{ + Kind, std::move(*RHS), FE->getPattern()}}; + else + LHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{ + Kind, std::move(*LHS), FE->getPattern()}}; + + return NormalizedConstraint( + S.Context, std::move(*LHS), std::move(*RHS), + FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction + : CCK_Disjunction); + } + auto Sub = fromConstraintExpr(S, D, FE->getPattern()); + if (!Sub) + return std::nullopt; + return NormalizedConstraint{new (S.Context) FoldExpandedConstraint{ + Kind, std::move(*Sub), FE->getPattern()}}; } + return NormalizedConstraint{new (S.Context) AtomicConstraint(S, E)}; } -using NormalForm = - llvm::SmallVector, 4>; +bool FoldExpandedConstraint::AreCompatibleForSubsumption( + const FoldExpandedConstraint &A, const FoldExpandedConstraint &B) { + + // [C++26] [temp.constr.fold] + // Two fold expanded constraints are compatible for subsumption + // if their respective constraints both contain an equivalent unexpanded pack. -static NormalForm makeCNF(const NormalizedConstraint &Normalized) { + llvm::SmallVector APacks, BPacks; + Sema::collectUnexpandedParameterPacks(const_cast(A.Pattern), APacks); + Sema::collectUnexpandedParameterPacks(const_cast(B.Pattern), BPacks); + + for (const UnexpandedParameterPack &APack : APacks) { + std::pair DepthAndIndex = getDepthAndIndex(APack); + auto it = llvm::find_if(BPacks, [&](const UnexpandedParameterPack &BPack) { + return getDepthAndIndex(BPack) == DepthAndIndex; + }); + if (it != BPacks.end()) + return true; + } + return false; +} + +NormalForm clang::makeCNF(const NormalizedConstraint &Normalized) { if (Normalized.isAtomic()) return {{Normalized.getAtomicConstraint()}}; + else if (Normalized.isFoldExpanded()) + return {{Normalized.getFoldExpandedConstraint()}}; + NormalForm LCNF = makeCNF(Normalized.getLHS()); NormalForm RCNF = makeCNF(Normalized.getRHS()); if (Normalized.getCompoundKind() == NormalizedConstraint::CCK_Conjunction) { @@ -1423,10 +1683,13 @@ static NormalForm makeCNF(const NormalizedConstraint &Normalized) { return Res; } -static NormalForm makeDNF(const NormalizedConstraint &Normalized) { +NormalForm clang::makeDNF(const NormalizedConstraint &Normalized) { if (Normalized.isAtomic()) return {{Normalized.getAtomicConstraint()}}; + else if (Normalized.isFoldExpanded()) + return {{Normalized.getFoldExpandedConstraint()}}; + NormalForm LDNF = makeDNF(Normalized.getLHS()); NormalForm RDNF = makeDNF(Normalized.getRHS()); if (Normalized.getCompoundKind() == NormalizedConstraint::CCK_Disjunction) { @@ -1453,60 +1716,6 @@ static NormalForm makeDNF(const NormalizedConstraint &Normalized) { return Res; } -template -static bool subsumes(const NormalForm &PDNF, const NormalForm &QCNF, - AtomicSubsumptionEvaluator E) { - // C++ [temp.constr.order] p2 - // Then, P subsumes Q if and only if, for every disjunctive clause Pi in the - // disjunctive normal form of P, Pi subsumes every conjunctive clause Qj in - // the conjuctive normal form of Q, where [...] 
- for (const auto &Pi : PDNF) { - for (const auto &Qj : QCNF) { - // C++ [temp.constr.order] p2 - // - [...] a disjunctive clause Pi subsumes a conjunctive clause Qj if - // and only if there exists an atomic constraint Pia in Pi for which - // there exists an atomic constraint, Qjb, in Qj such that Pia - // subsumes Qjb. - bool Found = false; - for (const AtomicConstraint *Pia : Pi) { - for (const AtomicConstraint *Qjb : Qj) { - if (E(*Pia, *Qjb)) { - Found = true; - break; - } - } - if (Found) - break; - } - if (!Found) - return false; - } - } - return true; -} - -template -static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, - NamedDecl *DQ, ArrayRef Q, bool &Subsumes, - AtomicSubsumptionEvaluator E) { - // C++ [temp.constr.order] p2 - // In order to determine if a constraint P subsumes a constraint Q, P is - // transformed into disjunctive normal form, and Q is transformed into - // conjunctive normal form. [...] - auto *PNormalized = S.getNormalizedAssociatedConstraints(DP, P); - if (!PNormalized) - return true; - const NormalForm PDNF = makeDNF(*PNormalized); - - auto *QNormalized = S.getNormalizedAssociatedConstraints(DQ, Q); - if (!QNormalized) - return true; - const NormalForm QCNF = makeCNF(*QNormalized); - - Subsumes = subsumes(PDNF, QCNF, E); - return false; -} - bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, MutableArrayRef AC1, NamedDecl *D2, @@ -1559,10 +1768,11 @@ bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, } } - if (subsumes(*this, D1, AC1, D2, AC2, Result, - [this] (const AtomicConstraint &A, const AtomicConstraint &B) { - return A.subsumes(Context, B); - })) + if (clang::subsumes( + *this, D1, AC1, D2, AC2, Result, + [this](const AtomicConstraint &A, const AtomicConstraint &B) { + return A.subsumes(Context, B); + })) return true; SubsumptionCache.try_emplace(Key, Result); return false; @@ -1619,10 +1829,12 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(NamedDecl *D1, const NormalForm DNF2 = makeDNF(*Normalized2); const NormalForm CNF2 = makeCNF(*Normalized2); - bool Is1AtLeastAs2Normally = subsumes(DNF1, CNF2, NormalExprEvaluator); - bool Is2AtLeastAs1Normally = subsumes(DNF2, CNF1, NormalExprEvaluator); - bool Is1AtLeastAs2 = subsumes(DNF1, CNF2, IdenticalExprEvaluator); - bool Is2AtLeastAs1 = subsumes(DNF2, CNF1, IdenticalExprEvaluator); + bool Is1AtLeastAs2Normally = + clang::subsumes(DNF1, CNF2, NormalExprEvaluator); + bool Is2AtLeastAs1Normally = + clang::subsumes(DNF2, CNF1, NormalExprEvaluator); + bool Is1AtLeastAs2 = clang::subsumes(DNF1, CNF2, IdenticalExprEvaluator); + bool Is2AtLeastAs1 = clang::subsumes(DNF2, CNF1, IdenticalExprEvaluator); if (Is1AtLeastAs2 == Is1AtLeastAs2Normally && Is2AtLeastAs1 == Is2AtLeastAs1Normally) // Same result - no ambiguity was caused by identical atomic expressions. 
diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 6df7f2223d267..3d4ccaf68c700 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -566,6 +566,10 @@ void Sema::collectUnexpandedParameterPacks( .TraverseDeclarationNameInfo(NameInfo); } +void Sema::collectUnexpandedParameterPacks( + Expr *E, SmallVectorImpl &Unexpanded) { + CollectUnexpandedParameterPacksVisitor(Unexpanded).TraverseStmt(E); +} ParsedTemplateArgument Sema::ActOnPackExpansion(const ParsedTemplateArgument &Arg, diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp new file mode 100644 index 0000000000000..1e0bc7bcfb4e7 --- /dev/null +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -0,0 +1,277 @@ +// RUN: %clang_cc1 -std=c++2c -verify %s + +template concept A = true; +template concept C = A && true; +template concept D = A && __is_same(T, int); + + +template requires (A) +constexpr int f(T) { return 0; }; +template requires (C && ...) +constexpr int f(T...) { return 1; }; + +static_assert(f(0) == 0); +static_assert(f(1) == 0); + + +template requires (A && ...) +constexpr int g(T...) { return 0; }; +template requires (C && ...) +constexpr int g(T...) { return 1; }; + +static_assert(g(0) == 1); +static_assert(g() == 1); +static_assert(g(1, 2) == 1); + + + +template requires (A && ...) +constexpr int h(T...) { return 0; }; // expected-note {{candidate}} +template requires (C || ...) +constexpr int h(T...) { return 1; }; // expected-note {{candidate}} + +static_assert(h(0) == 1); // expected-error {{call to 'h' is ambiguous}} + +template requires (A || ...) +constexpr int i(T...) { return 0; }; // expected-note {{candidate}} +template requires (C && ...) +constexpr int i(T...) { return 1; }; // expected-note {{candidate}} + +static_assert(i(0) == 1); // expected-error {{call to 'i' is ambiguous}} + + +template requires (A || ... || true) +constexpr int j(T...) { return 0; }; +template requires (C && ... && true) +constexpr int j(T...) { return 1; }; + +static_assert(j(0) == 1); +static_assert(j() == 1); + + + +template requires (A || ...) +constexpr int k(T...) { return 0; }; // expected-note {{candidate template ignored: constraints not satisfied [with T = <>]}} +template requires (C || ...) +constexpr int k(T...) { return 1; }; // expected-note {{candidate template ignored: constraints not satisfied [with T = <>]}} + +static_assert(k(0) == 1); +static_assert(k() == 0); // expected-error {{no matching function for call to 'k'}} +static_assert(k(1, 2) == 1); + + +consteval int terse(A auto...) {return 1;} +consteval int terse(D auto...) {return 2;} + +static_assert(terse() == 2); +static_assert(terse(0, 0) == 2); +static_assert(terse(0L, 0) == 1); + +template +consteval int tpl_head(A auto...) {return 1;} +template +consteval int tpl_head(D auto...) 
{return 2;} + +static_assert(tpl_head() == 2); +static_assert(tpl_head(0, 0) == 2); +static_assert(tpl_head(0L, 0) == 1); + + +namespace equivalence { + +template +struct S { + template + void f() requires (A && ...); + template + void f() requires (C && ...); + + template + void g() requires (A && ...); + template + void g() requires (C && ...); + + template + void h() requires (A && ...); // expected-note {{candidate}} + template + void h() requires (C && ...); // expected-note {{candidate}} +}; + +void test() { + S{}.f(); + S{}.g(); + S{}.h(); // expected-error {{call to member function 'h' is ambiguous}} +} + + +} + +namespace substitution { + struct S { + using type = int; +}; + +template +consteval int And1() requires (C && ...) { // #and1 + return 1; +} + +template +consteval int And2() requires (C && ... && C) { // #and2 + return 2; +} + +template +consteval int And3() requires (C && ... && C) { // #and3 + return 3; +} + +template +consteval int Or1() requires (C || ...) { // #or1 + return 1; +} + +template +consteval int Or2() requires (C || ... || C) { // #or2 + return 2; +} + +template +consteval int Or3() requires (C || ... || C) { // #or3 + return 3; +} + +static_assert(And1<>() == 1); +static_assert(And1() == 1); +static_assert(And1() == 1); +static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + +static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + +static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + +static_assert(And2() == 2); +static_assert(And2() == 2); +static_assert(And2() == 2); + +static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + +static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + +static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + +static_assert(And3() == 3); +static_assert(And3() == 3); +static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + +static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + +static_assert(And3() == 3); // 
expected-error {{no matching function for call to 'And3'}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + +static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + + +static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}} + // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} +static_assert(Or1() == 1); +static_assert(Or1() == 1); +static_assert(Or1() == 1); +static_assert(Or1() == 1); +static_assert(Or1() == 1); // expected-error {{no matching function for call to 'Or1'}} + // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} \ + // expected-note@#or1 {{because substituted constraint expression is ill-formed}} + + +static_assert(Or2() == 2); +static_assert(Or2() == 2); +static_assert(Or2() == 2); +static_assert(Or2() == 2); +static_assert(Or2() == 2); // expected-error {{no matching function for call to 'Or2'}} + // expected-note@#or2 {{candidate template ignored: constraints not satisfied}} \ + // expected-note@#or2 {{because substituted constraint expression is ill-formed}} + +static_assert(Or3() == 3); +static_assert(Or3() == 3); +static_assert(Or3() == 3); +static_assert(Or3() == 3); +static_assert(Or3() == 3); // expected-error {{no matching function for call to 'Or3'}} + // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} \ + // expected-note@#or3 {{because substituted constraint expression is ill-formed}} +} + +namespace bool_conversion_break { + +template struct A; +struct Thingy { + static constexpr int compare(const Thingy&) {return 1;} +}; +template +void f(A *, A *) // expected-note {{candidate template ignored: failed template argument deduction}} +requires (T::compare(U{}) && ...); // expected-error {{atomic constraint must be of type 'bool' (found 'int')}} + +void g() { + A *ap; + f(ap, ap); // expected-error{{no matching function for call to 'f'}} \ + // expected-note {{while checking constraint satisfaction}} \ + // expected-note {{in instantiation of function template specialization}} +} + +} + +namespace nested { + +template +struct S { + template + consteval static int f() + requires ((A && ...) && ... && A ) { + return 1; + } + + template + consteval static int f() + requires ((C && ...) && ... && C ) { + return 2; + } + + template + consteval static int g() // #nested-ambiguous-g1 + requires ((A && ...) && ... && A ) { + return 1; + } + + template + consteval static int g() // #nested-ambiguous-g2 + requires ((C && ...) && ... && C ) { + return 2; + } +}; + +static_assert(S::f() == 2); + +static_assert(S::g() == 2); // expected-error {{call to 'g' is ambiguous}} + // expected-note@#nested-ambiguous-g1 {{candidate}} + // expected-note@#nested-ambiguous-g2 {{candidate}} + + +} diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 27e2213e54caa..a6ded8be3ae9e 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -218,7 +218,7 @@

    C++2c implementation status

    Ordering of constraints involving fold expressions P2963R3 - No + Clang 19 Structured binding declaration as a condition From f10a78b7e48f9067bc2e5a67ea2166b707701f29 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 16 Jul 2024 17:22:43 +0100 Subject: [PATCH 222/777] [AMDGPU] clang-tidy: use std::make_unique. NFC. --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6 +++--- llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 16 +++++++++------- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 12 +++++++----- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index eb67963c1d660..9c71f20920c01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -344,13 +344,13 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { case AMDGPU::AMDHSA_COV4: - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); + HSAMetadataStream = std::make_unique(); break; case AMDGPU::AMDHSA_COV5: - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5()); + HSAMetadataStream = std::make_unique(); break; case AMDGPU::AMDHSA_COV6: - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV6()); + HSAMetadataStream = std::make_unique(); break; default: report_fatal_error("Unexpected code object version"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index dbc9233b72def..40d2450d775fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -1084,9 +1084,9 @@ bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) { AMDGPULibFunc::AMDGPULibFunc(const AMDGPULibFunc &F) { if (auto *MF = dyn_cast(F.Impl.get())) - Impl.reset(new AMDGPUMangledLibFunc(*MF)); + Impl = std::make_unique(*MF); else if (auto *UMF = dyn_cast(F.Impl.get())) - Impl.reset(new AMDGPUUnmangledLibFunc(*UMF)); + Impl = std::make_unique(*UMF); else Impl = std::unique_ptr(); } @@ -1101,19 +1101,21 @@ AMDGPULibFunc &AMDGPULibFunc::operator=(const AMDGPULibFunc &F) { AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom) { assert(AMDGPULibFuncBase::isMangled(Id) && CopyFrom.isMangled() && "not supported"); - Impl.reset(new AMDGPUMangledLibFunc( - Id, *cast(CopyFrom.Impl.get()))); + Impl = std::make_unique( + Id, *cast(CopyFrom.Impl.get())); } AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, FunctionType *FT, bool SignedInts) { - Impl.reset(new AMDGPUMangledLibFunc(Id, FT, SignedInts)); + Impl = std::make_unique(Id, FT, SignedInts); } AMDGPULibFunc::AMDGPULibFunc(StringRef Name, FunctionType *FT) { - Impl.reset(new AMDGPUUnmangledLibFunc(Name, FT)); + Impl = std::make_unique(Name, FT); } -void AMDGPULibFunc::initMangled() { Impl.reset(new AMDGPUMangledLibFunc()); } +void AMDGPULibFunc::initMangled() { + Impl = std::make_unique(); +} AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() { if (!Impl) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 49af0025afa9c..55218afb9a8e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -203,11 +203,13 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, // clang-format on MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); - 
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
-  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
-  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
-  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
-  InstSelector.reset(new AMDGPUInstructionSelector(*this, *RegBankInfo, TM));
+  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
+  InlineAsmLoweringInfo =
+      std::make_unique<InlineAsmLowering>(getTargetLowering());
+  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
+  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
+  InstSelector =
+      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
 }
 
 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6e74d619003d..6d12e8c6f2de2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1089,7 +1089,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // whole block for every handled copy.
   std::unique_ptr<RegScavenger> RS;
   if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
-    RS.reset(new RegScavenger());
+    RS = std::make_unique<RegScavenger>();
 
   ArrayRef SubIndices = RI.getRegSplitParts(RC, EltSize);
 

From 31087c5e4c8ddfe08ab3ea6d3847e05c4738eeee Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Wed, 17 Jul 2024 09:15:47 +0200
Subject: [PATCH 223/777] [flang] handle alloca outside of entry blocks in MemoryAllocation (#98457)

This patch generalizes the MemoryAllocation pass (alloca -> heap) to
handle fir.alloca regardless of their position in the IR. Previously, it
only dealt with fir.alloca in function entry blocks.

The logic is placed in a utility that can be used to replace the allocas
of an operation on demand with whatever kind of allocation the utility
user wants via callbacks (allocmem, or custom runtime calls to instrument
the code...).

To do so, a concept of ownership, which was already implied and used in
passes like stack-reclaim, is formalized: any operation with the
LoopLikeInterface, AutomaticAllocationScope, or IsolatedFromAbove trait
owns the allocas directly nested inside its regions, and they must not be
used after the operation. The pass then looks for the exit points of
regions with such an interface and uses them to insert deallocations.

If dominance is not proved, the pass falls back to storing the new
address into a C pointer variable created in the entry of the owning
region, which allows inserting deallocations as needed, including near
the alloca itself, to avoid leaks when the alloca is executed multiple
times due to block CFG loops.

This should fix https://github.com/llvm/llvm-project/issues/88344.

As a next step, I will try to refactor lowering a bit to introduce
lifetime operations for allocas so that the deallocation points can be
inserted as soon as possible.
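
To make the effect concrete, here is a minimal before/after sketch of the
rewrite, modeled on the test_loop case in the new
flang/test/Fir/memory-allocation-opt-2.fir test added below. The SSA names
and the array element type are illustrative, not taken verbatim from the
test.

    // Before: dynamically sized stack allocation made inside the loop body,
    // i.e. outside the function entry block.
    fir.do_loop %i = %c1 to %c100 step %c1 {
      %buf = fir.alloca !fir.array<?xi8>, %i
      fir.call @bar(%buf) : (!fir.ref<!fir.array<?xi8>>) -> ()
    }

With dynamic-array-on-heap=true this becomes:

    // After: heap allocation with a matching free at the loop-body exit,
    // since the fir.do_loop owns the alloca made directly in its region.
    fir.do_loop %i = %c1 to %c100 step %c1 {
      %heap = fir.allocmem !fir.array<?xi8>, %i
      %buf = fir.convert %heap : (!fir.heap<!fir.array<?xi8>>) -> !fir.ref<!fir.array<?xi8>>
      fir.call @bar(%buf) : (!fir.ref<!fir.array<?xi8>>) -> ()
      fir.freemem %heap : !fir.heap<!fir.array<?xi8>>
    }

When such a static placement cannot be proven correct (unstructured control
flow or block cycles), the pass instead stores the heap address in a pointer
variable created in the owning region's entry block and frees it
conditionally, as exercised by the other cases in the same test.
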
--- .../flang/Optimizer/Builder/FIRBuilder.h | 18 +- .../include/flang/Optimizer/Dialect/FIROps.td | 13 + .../flang/Optimizer/Transforms/MemoryUtils.h | 62 ++++ flang/lib/Optimizer/Builder/FIRBuilder.cpp | 12 +- flang/lib/Optimizer/Dialect/FIROps.cpp | 21 ++ flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../Optimizer/Transforms/MemoryAllocation.cpp | 143 +++------ .../lib/Optimizer/Transforms/MemoryUtils.cpp | 287 ++++++++++++++++++ flang/test/Fir/memory-allocation-opt-2.fir | 161 ++++++++++ 9 files changed, 610 insertions(+), 108 deletions(-) create mode 100644 flang/include/flang/Optimizer/Transforms/MemoryUtils.h create mode 100644 flang/lib/Optimizer/Transforms/MemoryUtils.cpp create mode 100644 flang/test/Fir/memory-allocation-opt-2.fir diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index ea35b298c0209..17a9a20c9b439 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -38,6 +38,13 @@ class ExtendedValue; class MutableBoxValue; class BoxValue; +/// Get the integer type with a pointer size. +inline mlir::Type getIntPtrType(mlir::OpBuilder &builder) { + // TODO: Delay the need of such type until codegen or find a way to use + // llvm::DataLayout::getPointerSizeInBits here. + return builder.getI64Type(); +} + //===----------------------------------------------------------------------===// // FirOpBuilder //===----------------------------------------------------------------------===// @@ -143,11 +150,7 @@ class FirOpBuilder : public mlir::OpBuilder, public mlir::OpBuilder::Listener { /// Get the integer type whose bit width corresponds to the width of pointer /// types, or is bigger. - mlir::Type getIntPtrType() { - // TODO: Delay the need of such type until codegen or find a way to use - // llvm::DataLayout::getPointerSizeInBits here. - return getI64Type(); - } + mlir::Type getIntPtrType() { return fir::getIntPtrType(*this); } /// Wrap `str` to a SymbolRefAttr. mlir::SymbolRefAttr getSymbolRefAttr(llvm::StringRef str) { @@ -712,6 +715,11 @@ fir::BoxValue createBoxValue(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value createNullBoxProc(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType); +/// Convert a value to a new type. Return the value directly if it has the right +/// type. +mlir::Value createConvert(mlir::OpBuilder &, mlir::Location, mlir::Type, + mlir::Value); + /// Set internal linkage attribute on a function. void setInternalLinkage(mlir::func::FuncOp); diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 5b03806614f9b..89c13fa7cebe6 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -124,6 +124,13 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Indeed, a user would likely expect a good Fortran compiler to perform such an optimization. + Stack allocations have a maximum lifetime concept: their uses must not + exceed the lifetime of the closest parent operation with the + AutomaticAllocationScope trait, IsIsolatedFromAbove trait, or + LoopLikeOpInterface trait. This restriction is meant to ease the + insertion of stack save and restore operations, and to ease the conversion + of stack allocation into heap allocation. + Until Fortran 2018, procedures defaulted to non-recursive. A legal implementation could therefore convert stack allocations to global allocations. 
Such a conversion effectively adds the SAVE attribute to all @@ -183,11 +190,17 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, mlir::Type getAllocatedType(); bool hasLenParams() { return !getTypeparams().empty(); } bool hasShapeOperands() { return !getShape().empty(); } + bool isDynamic() {return hasLenParams() || hasShapeOperands();} unsigned numLenParams() { return getTypeparams().size(); } operand_range getLenParams() { return getTypeparams(); } unsigned numShapeOperands() { return getShape().size(); } operand_range getShapeOperands() { return getShape(); } static mlir::Type getRefTy(mlir::Type ty); + /// Is this an operation that owns the alloca directly made in its region? + static bool ownsNestedAlloca(mlir::Operation* op); + /// Get the parent region that owns this alloca. Nullptr if none can be + /// identified. + mlir::Region* getOwnerRegion(); }]; } diff --git a/flang/include/flang/Optimizer/Transforms/MemoryUtils.h b/flang/include/flang/Optimizer/Transforms/MemoryUtils.h new file mode 100644 index 0000000000000..92a519cd0c838 --- /dev/null +++ b/flang/include/flang/Optimizer/Transforms/MemoryUtils.h @@ -0,0 +1,62 @@ +//===-- Optimizer/Transforms/MemoryUtils.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// +// +// This file defines a utility to replace fir.alloca by dynamic allocation and +// deallocation. The exact kind of dynamic allocation is left to be defined by +// the utility user via callbacks (could be fir.allocmem or custom runtime +// calls). +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_MEMORYUTILS_H +#define FORTRAN_OPTIMIZER_TRANSFORMS_MEMORYUTILS_H + +#include "flang/Optimizer/Dialect/FIROps.h" + +namespace mlir { +class RewriterBase; +} + +namespace fir { + +/// Type of callbacks that indicate if a given fir.alloca must be +/// rewritten. +using MustRewriteCallBack = llvm::function_ref; + +/// Type of callbacks that produce the replacement for a given fir.alloca. +/// It is provided extra information about the dominance of the deallocation +/// points that have been identified, and may refuse to replace the alloca, +/// even if the MustRewriteCallBack previously returned true, in which case +/// it should return a null value. +/// The callback should not delete the alloca, the utility will do it. +using AllocaRewriterCallBack = llvm::function_ref; +/// Type of callbacks that must generate deallocation of storage obtained via +/// AllocaRewriterCallBack calls. +using DeallocCallBack = + llvm::function_ref; + +/// Utility to replace fir.alloca by dynamic allocations inside \p parentOp. +/// \p MustRewriteCallBack lets the user control which fir.alloca should be +/// replaced. \p AllocaRewriterCallBack lets the user define how the new memory +/// should be allocated. \p DeallocCallBack lets the user decide how the memory +/// should be deallocated. The boolean result indicates if the utility succeeded +/// to replace all fir.alloca as requested by the user. 
Causes of failures are +/// the presence of unregistered operations, or OpenMP/ACC recipe operations +/// that return memory allocated inside their region. +bool replaceAllocas(mlir::RewriterBase &rewriter, mlir::Operation *parentOp, + MustRewriteCallBack, AllocaRewriterCallBack, + DeallocCallBack); + +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_TRANSFORMS_MEMORYUTILS_H diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 2ea302d188018..2961df96b3cab 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -455,15 +455,21 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics( return createConvert(loc, toTy, val); } -mlir::Value fir::FirOpBuilder::createConvert(mlir::Location loc, - mlir::Type toTy, mlir::Value val) { +mlir::Value fir::factory::createConvert(mlir::OpBuilder &builder, + mlir::Location loc, mlir::Type toTy, + mlir::Value val) { if (val.getType() != toTy) { assert(!fir::isa_derived(toTy)); - return create(loc, toTy, val); + return builder.create(loc, toTy, val); } return val; } +mlir::Value fir::FirOpBuilder::createConvert(mlir::Location loc, + mlir::Type toTy, mlir::Value val) { + return fir::factory::createConvert(*this, loc, toTy, val); +} + void fir::FirOpBuilder::createStoreWithConvert(mlir::Location loc, mlir::Value val, mlir::Value addr) { diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index a499a6e4f8d04..9e6b88041ba69 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -275,6 +275,27 @@ llvm::LogicalResult fir::AllocaOp::verify() { return mlir::success(); } +bool fir::AllocaOp::ownsNestedAlloca(mlir::Operation *op) { + return op->hasTrait() || + op->hasTrait() || + mlir::isa(*op); +} + +mlir::Region *fir::AllocaOp::getOwnerRegion() { + mlir::Operation *currentOp = getOperation(); + while (mlir::Operation *parentOp = currentOp->getParentOp()) { + // If the operation was not registered, inquiries about its traits will be + // incorrect and it is not possible to reason about the operation. This + // should not happen in a normal Fortran compilation flow, but be foolproof. 
+ if (!parentOp->isRegistered()) + return nullptr; + if (fir::AllocaOp::ownsNestedAlloca(parentOp)) + return currentOp->getParentRegion(); + currentOp = parentOp; + } + return nullptr; +} + //===----------------------------------------------------------------------===// // AllocMemOp //===----------------------------------------------------------------------===// diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 94d94398d696a..3108304240894 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -10,6 +10,7 @@ add_flang_library(FIRTransforms ControlFlowConverter.cpp ArrayValueCopy.cpp ExternalNameConversion.cpp + MemoryUtils.cpp MemoryAllocation.cpp StackArrays.cpp MemRefDataFlowOpt.cpp diff --git a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp index 03b1ae89428af..3f308a8f4b560 100644 --- a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp +++ b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Transforms/MemoryUtils.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Diagnostics.h" @@ -27,50 +28,18 @@ namespace fir { // Number of elements in an array does not determine where it is allocated. static constexpr std::size_t unlimitedArraySize = ~static_cast(0); -namespace { -class ReturnAnalysis { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReturnAnalysis) - - ReturnAnalysis(mlir::Operation *op) { - if (auto func = mlir::dyn_cast(op)) - for (mlir::Block &block : func) - for (mlir::Operation &i : block) - if (mlir::isa(i)) { - returnMap[op].push_back(&i); - break; - } - } - - llvm::SmallVector getReturns(mlir::Operation *func) const { - auto iter = returnMap.find(func); - if (iter != returnMap.end()) - return iter->second; - return {}; - } - -private: - llvm::DenseMap> - returnMap; -}; -} // namespace - /// Return `true` if this allocation is to remain on the stack (`fir.alloca`). /// Otherwise the allocation should be moved to the heap (`fir.allocmem`). static inline bool -keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry, +keepStackAllocation(fir::AllocaOp alloca, const fir::MemoryAllocationOptOptions &options) { - // Limitation: only arrays allocated on the stack in the entry block are - // considered for now. - // TODO: Generalize the algorithm and placement of the freemem nodes. - if (alloca->getBlock() != entry) - return true; + // Move all arrays and character with runtime determined size to the heap. + if (options.dynamicArrayOnHeap && alloca.isDynamic()) + return false; + // TODO: use data layout to reason in terms of byte size to cover all "big" + // entities, which may be scalar derived types. if (auto seqTy = mlir::dyn_cast(alloca.getInType())) { - if (fir::hasDynamicSize(seqTy)) { - // Move all arrays with runtime determined size to the heap. - if (options.dynamicArrayOnHeap) - return false; - } else { + if (!fir::hasDynamicSize(seqTy)) { std::int64_t numberOfElements = 1; for (std::int64_t i : seqTy.getShape()) { numberOfElements *= i; @@ -82,8 +51,6 @@ keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry, // the heap. 
if (static_cast(numberOfElements) > options.maxStackArraySize) { - LLVM_DEBUG(llvm::dbgs() - << "memory allocation opt: found " << alloca << '\n'); return false; } } @@ -91,49 +58,30 @@ keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry, return true; } -namespace { -class AllocaOpConversion : public mlir::OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - AllocaOpConversion(mlir::MLIRContext *ctx, - llvm::ArrayRef rets) - : OpRewritePattern(ctx), returnOps(rets) {} - - llvm::LogicalResult - matchAndRewrite(fir::AllocaOp alloca, - mlir::PatternRewriter &rewriter) const override { - auto loc = alloca.getLoc(); - mlir::Type varTy = alloca.getInType(); - auto unpackName = - [](std::optional opt) -> llvm::StringRef { - if (opt) - return *opt; - return {}; - }; - auto uniqName = unpackName(alloca.getUniqName()); - auto bindcName = unpackName(alloca.getBindcName()); - auto heap = rewriter.create( - loc, varTy, uniqName, bindcName, alloca.getTypeparams(), - alloca.getShape()); - auto insPt = rewriter.saveInsertionPoint(); - for (mlir::Operation *retOp : returnOps) { - rewriter.setInsertionPoint(retOp); - [[maybe_unused]] auto free = rewriter.create(loc, heap); - LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: add free " << free - << " for " << heap << '\n'); - } - rewriter.restoreInsertionPoint(insPt); - rewriter.replaceOpWithNewOp( - alloca, fir::ReferenceType::get(varTy), heap); - LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: replaced " << alloca - << " with " << heap << '\n'); - return mlir::success(); - } +static mlir::Value genAllocmem(mlir::OpBuilder &builder, fir::AllocaOp alloca, + bool deallocPointsDominateAlloc) { + mlir::Type varTy = alloca.getInType(); + auto unpackName = [](std::optional opt) -> llvm::StringRef { + if (opt) + return *opt; + return {}; + }; + llvm::StringRef uniqName = unpackName(alloca.getUniqName()); + llvm::StringRef bindcName = unpackName(alloca.getBindcName()); + auto heap = builder.create(alloca.getLoc(), varTy, uniqName, + bindcName, alloca.getTypeparams(), + alloca.getShape()); + LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: replaced " << alloca + << " with " << heap << '\n'); + return heap; +} -private: - llvm::ArrayRef returnOps; -}; +static void genFreemem(mlir::Location loc, mlir::OpBuilder &builder, + mlir::Value allocmem) { + [[maybe_unused]] auto free = builder.create(loc, allocmem); + LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: add free " << free + << " for " << allocmem << '\n'); +} /// This pass can reclassify memory allocations (fir.alloca, fir.allocmem) based /// on heuristics and settings. The intention is to allow better performance and @@ -144,6 +92,7 @@ class AllocaOpConversion : public mlir::OpRewritePattern { /// make it a heap allocation. /// 2. If a stack allocation is an array with a runtime evaluated size make /// it a heap allocation. +namespace { class MemoryAllocationOpt : public fir::impl::MemoryAllocationOptBase { public: @@ -184,23 +133,17 @@ class MemoryAllocationOpt // If func is a declaration, skip it. 
if (func.empty()) return; - - const auto &analysis = getAnalysis(); - - target.addLegalDialect(); - target.addDynamicallyLegalOp([&](fir::AllocaOp alloca) { - return keepStackAllocation(alloca, &func.front(), options); - }); - - llvm::SmallVector returnOps = analysis.getReturns(func); - patterns.insert(context, returnOps); - if (mlir::failed( - mlir::applyPartialConversion(func, target, std::move(patterns)))) { - mlir::emitError(func.getLoc(), - "error in memory allocation optimization\n"); - signalPassFailure(); - } + auto tryReplacing = [&](fir::AllocaOp alloca) { + bool res = !keepStackAllocation(alloca, options); + if (res) { + LLVM_DEBUG(llvm::dbgs() + << "memory allocation opt: found " << alloca << '\n'); + } + return res; + }; + mlir::IRRewriter rewriter(context); + fir::replaceAllocas(rewriter, func.getOperation(), tryReplacing, + genAllocmem, genFreemem); } private: diff --git a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp new file mode 100644 index 0000000000000..1f8edf851de9b --- /dev/null +++ b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp @@ -0,0 +1,287 @@ +//===- MemoryUtils.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Transforms/MemoryUtils.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Dominance.h" +#include "llvm/ADT/STLExtras.h" + +namespace { +/// Helper class to detect if an alloca is inside an mlir::Block that can be +/// reached again before its deallocation points via block successors. This +/// analysis is only valid if the deallocation points are inside (or nested +/// inside) the same region as alloca because it does not consider region CFG +/// (for instance, the block inside a fir.do_loop is obviously inside a loop, +/// but is not a loop formed by blocks). The dominance of the alloca on its +/// deallocation points implies this pre-condition (although it is more +/// restrictive). +class BlockCycleDetector { +public: + bool allocaIsInCycle(fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints); + +private: + // Cache for blocks owning alloca that have been analyzed. In many Fortran + // programs, allocas are usually made in the same blocks with no block cycles. + // So getting a fast "no" is beneficial. 
+ llvm::DenseMap analyzed; +}; +} // namespace + +namespace { +class AllocaReplaceImpl { +public: + AllocaReplaceImpl(fir::AllocaRewriterCallBack allocaRewriter, + fir::DeallocCallBack deallocGenerator) + : allocaRewriter{allocaRewriter}, deallocGenerator{deallocGenerator} {} + bool replace(mlir::RewriterBase &, fir::AllocaOp); + +private: + mlir::Region *findDeallocationPointsAndOwner( + fir::AllocaOp alloca, + llvm::SmallVectorImpl &deallocationPoints); + bool + allocDominatesDealloc(fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints) { + return llvm::all_of(deallocationPoints, [&](mlir::Operation *deallocPoint) { + return this->dominanceInfo.properlyDominates(alloca.getOperation(), + deallocPoint); + }); + } + void + genIndirectDeallocation(mlir::RewriterBase &, fir::AllocaOp, + llvm::ArrayRef deallocationPoints, + mlir::Value replacement, mlir::Region &owningRegion); + +private: + fir::AllocaRewriterCallBack allocaRewriter; + fir::DeallocCallBack deallocGenerator; + mlir::DominanceInfo dominanceInfo; + BlockCycleDetector blockCycleDetector; +}; +} // namespace + +static bool +allocaIsInCycleImpl(mlir::Block *allocaBlock, + llvm::ArrayRef deallocationPoints) { + llvm::DenseSet seen; + // Insert the deallocation point blocks as "seen" so that the block + // traversal will stop at them. + for (mlir::Operation *deallocPoint : deallocationPoints) + seen.insert(deallocPoint->getBlock()); + if (seen.contains(allocaBlock)) + return false; + // Traverse the block successor graph starting by the alloca block. + llvm::SmallVector successors{allocaBlock}; + while (!successors.empty()) + for (mlir::Block *next : successors.pop_back_val()->getSuccessors()) { + if (next == allocaBlock) + return true; + if (auto pair = seen.insert(next); pair.second) + successors.push_back(next); + } + // The traversal did not reach the alloca block again. + return false; +} +bool BlockCycleDetector::allocaIsInCycle( + fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints) { + mlir::Block *allocaBlock = alloca->getBlock(); + auto analyzedPair = analyzed.try_emplace(allocaBlock, /*isInCycle=*/false); + bool alreadyAnalyzed = !analyzedPair.second; + bool &isInCycle = analyzedPair.first->second; + // Fast exit if block was already analyzed and no cycle was found. + if (alreadyAnalyzed && !isInCycle) + return false; + // If the analysis was not done generically for this block, run it and + // save the result. + if (!alreadyAnalyzed) + isInCycle = allocaIsInCycleImpl(allocaBlock, /*deallocationPoints*/ {}); + if (!isInCycle) + return false; + // If the generic analysis found a block loop, see if the deallocation + // point would be reached before reaching the block again. Do not + // cache that analysis that is specific to the deallocation points + // found for this alloca. + return allocaIsInCycleImpl(allocaBlock, deallocationPoints); +} + +static bool terminatorYieldsMemory(mlir::Operation &terminator) { + return llvm::any_of(terminator.getResults(), [](mlir::OpResult res) { + return fir::conformsWithPassByRef(res.getType()); + }); +} + +static bool isRegionTerminator(mlir::Operation &terminator) { + // Using ReturnLike trait is tempting but it is not set on + // all region terminator that matters (like omp::TerminatorOp that + // has no results). + // May be true for dead code. It is not a correctness issue and dead code can + // be eliminated by running region simplification before this utility is + // used. 
+ // May also be true for unreachable like terminators (e.g., after an abort + // call related to Fortran STOP). This is also OK, the inserted deallocation + // will simply never be reached. It is easier for the rest of the code here + // to assume there is always at least one deallocation point, so keep + // unreachable terminators. + return !terminator.hasSuccessors(); +} + +mlir::Region *AllocaReplaceImpl::findDeallocationPointsAndOwner( + fir::AllocaOp alloca, + llvm::SmallVectorImpl &deallocationPoints) { + // Step 1: Identify the operation and region owning the alloca. + mlir::Region *owningRegion = alloca.getOwnerRegion(); + if (!owningRegion) + return nullptr; + mlir::Operation *owningOp = owningRegion->getParentOp(); + assert(owningOp && "region expected to be owned"); + // Step 2: Identify the exit points of the owning region, they are the default + // deallocation points. TODO: detect and use lifetime markers to get earlier + // deallocation points. + bool isOpenACCMPRecipe = mlir::isa(owningOp); + for (mlir::Block &block : owningRegion->getBlocks()) + if (mlir::Operation *terminator = block.getTerminator(); + isRegionTerminator(*terminator)) { + // FIXME: OpenACC and OpenMP privatization recipe are stand alone + // operation meant to be later "inlined", the value they return may + // be the address of a local alloca. It would be incorrect to insert + // deallocation before the terminator (this would introduce use after + // free once the recipe is inlined. + // This probably require redesign or special handling on the OpenACC/MP + // side. + if (isOpenACCMPRecipe && terminatorYieldsMemory(*terminator)) + return nullptr; + deallocationPoints.push_back(terminator); + } + // If no block terminators without successors have been found, this is + // an odd region we cannot reason about (never seen yet in FIR and + // mainstream dialects, but MLIR does not really prevent it). + if (deallocationPoints.empty()) + return nullptr; + + // Step 3: detect block based loops between the allocation and deallocation + // points, and add a deallocation point on the back edge to avoid memory + // leaks. + // The detection avoids doing region CFG analysis by assuming that there may + // be cycles if deallocation points are not dominated by the alloca. + // This leaves the cases where the deallocation points are in the same region + // as the alloca (or nested inside it). In which cases there may be a back + // edge between the alloca and the deallocation point via block successors. An + // analysis is run to detect those cases. + // When a loop is detected, the easiest solution to deallocate on the back + // edge is to store the allocated memory address in a variable (that dominates + // the loops) and to deallocate the address in that variable if it is set + // before executing the allocation. This strategy still leads to correct + // execution in the "false positive" cases. + // Hence, the alloca is added as a deallocation point when there is no + // dominance. Note that bringing lifetime markers above will reduce the + // false positives. 
+ if (!allocDominatesDealloc(alloca, deallocationPoints) || + blockCycleDetector.allocaIsInCycle(alloca, deallocationPoints)) + deallocationPoints.push_back(alloca.getOperation()); + return owningRegion; +} + +void AllocaReplaceImpl::genIndirectDeallocation( + mlir::RewriterBase &rewriter, fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints, + mlir::Value replacement, mlir::Region &owningRegion) { + mlir::Location loc = alloca.getLoc(); + auto replacementInsertPoint = rewriter.saveInsertionPoint(); + // Create C pointer variable in the entry block to store the alloc + // and access it indirectly in the entry points that do not dominate. + rewriter.setInsertionPointToStart(&owningRegion.front()); + mlir::Type heapType = fir::HeapType::get(alloca.getInType()); + mlir::Value ptrVar = rewriter.create(loc, heapType); + mlir::Value nullPtr = rewriter.create(loc, heapType); + rewriter.create(loc, nullPtr, ptrVar); + // TODO: introducing a pointer compare op in FIR would help + // generating less IR here. + mlir::Type intPtrTy = fir::getIntPtrType(rewriter); + mlir::Value c0 = rewriter.create( + loc, intPtrTy, rewriter.getIntegerAttr(intPtrTy, 0)); + + // Store new storage address right after its creation. + rewriter.restoreInsertionPoint(replacementInsertPoint); + mlir::Value castReplacement = + fir::factory::createConvert(rewriter, loc, heapType, replacement); + rewriter.create(loc, castReplacement, ptrVar); + + // Generate conditional deallocation at every deallocation point. + auto genConditionalDealloc = [&](mlir::Location loc) { + mlir::Value ptrVal = rewriter.create(loc, ptrVar); + mlir::Value ptrToInt = + rewriter.create(loc, intPtrTy, ptrVal); + mlir::Value isAllocated = rewriter.create( + loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0); + auto ifOp = rewriter.create(loc, std::nullopt, isAllocated, + /*withElseRegion=*/false); + rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); + mlir::Value cast = fir::factory::createConvert( + rewriter, loc, replacement.getType(), ptrVal); + deallocGenerator(loc, rewriter, cast); + // Currently there is no need to reset the pointer var because two + // deallocation points can never be reached without going through the + // alloca. 
+ rewriter.setInsertionPointAfter(ifOp); + }; + for (mlir::Operation *deallocPoint : deallocationPoints) { + rewriter.setInsertionPoint(deallocPoint); + genConditionalDealloc(deallocPoint->getLoc()); + } +} + +bool AllocaReplaceImpl::replace(mlir::RewriterBase &rewriter, + fir::AllocaOp alloca) { + llvm::SmallVector deallocationPoints; + mlir::Region *owningRegion = + findDeallocationPointsAndOwner(alloca, deallocationPoints); + if (!owningRegion) + return false; + rewriter.setInsertionPointAfter(alloca.getOperation()); + bool deallocPointsDominateAlloc = + allocDominatesDealloc(alloca, deallocationPoints); + if (mlir::Value replacement = + allocaRewriter(rewriter, alloca, deallocPointsDominateAlloc)) { + mlir::Value castReplacement = fir::factory::createConvert( + rewriter, alloca.getLoc(), alloca.getType(), replacement); + if (deallocPointsDominateAlloc) + for (mlir::Operation *deallocPoint : deallocationPoints) { + rewriter.setInsertionPoint(deallocPoint); + deallocGenerator(deallocPoint->getLoc(), rewriter, replacement); + } + else + genIndirectDeallocation(rewriter, alloca, deallocationPoints, replacement, + *owningRegion); + rewriter.replaceOp(alloca, castReplacement); + } + return true; +} + +bool fir::replaceAllocas(mlir::RewriterBase &rewriter, + mlir::Operation *parentOp, + MustRewriteCallBack mustReplace, + AllocaRewriterCallBack allocaRewriter, + DeallocCallBack deallocGenerator) { + // If the parent operation is not an alloca owner, the code below would risk + // modifying IR outside of parentOp. + if (!fir::AllocaOp::ownsNestedAlloca(parentOp)) + return false; + auto insertPoint = rewriter.saveInsertionPoint(); + bool replacedAllRequestedAlloca = true; + AllocaReplaceImpl impl(allocaRewriter, deallocGenerator); + parentOp->walk([&](fir::AllocaOp alloca) { + if (mustReplace(alloca)) + replacedAllRequestedAlloca &= impl.replace(rewriter, alloca); + }); + rewriter.restoreInsertionPoint(insertPoint); + return replacedAllRequestedAlloca; +} diff --git a/flang/test/Fir/memory-allocation-opt-2.fir b/flang/test/Fir/memory-allocation-opt-2.fir new file mode 100644 index 0000000000000..2addb6ba8b999 --- /dev/null +++ b/flang/test/Fir/memory-allocation-opt-2.fir @@ -0,0 +1,161 @@ +// Test memory allocation pass for fir.alloca outside of function entry block +// RUN: fir-opt --memory-allocation-opt="dynamic-array-on-heap=true" %s | FileCheck %s + +func.func @test_loop() { + %c1 = arith.constant 1 : index + %c100 = arith.constant 100 : index + fir.do_loop %arg0 = %c1 to %c100 step %c1 { + %1 = fir.alloca !fir.array, %arg0 + fir.call @bar(%1) : (!fir.ref>) -> () + fir.result + } + return +} +// CHECK-LABEL: func.func @test_loop() { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 100 : index +// CHECK: fir.do_loop %[[VAL_2:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_0]] { +// CHECK: %[[VAL_3:.*]] = fir.allocmem !fir.array, %[[VAL_2]] {bindc_name = "", uniq_name = ""} +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.heap>) -> !fir.ref> +// CHECK: fir.call @bar(%[[VAL_4]]) : (!fir.ref>) -> () +// CHECK: fir.freemem %[[VAL_3]] : !fir.heap> +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @test_unstructured(%n : index) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c100 = arith.constant 100 : index + %0 = fir.alloca index + fir.store %c100 to %0 : !fir.ref + cf.br ^bb1 +^bb1: // 2 preds: ^bb0, ^bb4 + %5 = fir.load %0 : !fir.ref + %6 = arith.cmpi sgt, %5, %c0 : index + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: 
^bb1 + %1 = fir.alloca !fir.array, %5 + fir.call @bar(%1) : (!fir.ref>) -> () + %25 = arith.cmpi slt, %5, %n : index + cf.cond_br %25, ^bb3, ^bb4 +^bb3: // pred: ^bb2 + fir.call @abort() : () -> () + fir.unreachable +^bb4: // pred: ^bb2 + %28 = arith.subi %5, %c1 : index + fir.store %28 to %0 : !fir.ref + cf.br ^bb1 +^bb5: // pred: ^bb1 + return +} +// CHECK-LABEL: func.func @test_unstructured( +// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK: %[[VAL_1:.*]] = fir.alloca !fir.heap> +// CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.heap> +// CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 100 : index +// CHECK: %[[VAL_7:.*]] = fir.alloca index +// CHECK: fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_4]] : index +// CHECK: cf.cond_br %[[VAL_9]], ^bb2, ^bb5 +// CHECK: ^bb2: +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_12]] { +// CHECK: fir.freemem %[[VAL_10]] : !fir.heap> +// CHECK: } +// CHECK: %[[VAL_13:.*]] = fir.allocmem !fir.array, %[[VAL_8]] {bindc_name = "", uniq_name = ""} +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.heap>) -> !fir.ref> +// CHECK: fir.store %[[VAL_13]] to %[[VAL_1]] : !fir.ref>> +// CHECK: fir.call @bar(%[[VAL_14]]) : (!fir.ref>) -> () +// CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_8]], %[[VAL_0]] : index +// CHECK: cf.cond_br %[[VAL_15]], ^bb3, ^bb4 +// CHECK: ^bb3: +// CHECK: fir.call @abort() : () -> () +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_18]] { +// CHECK: fir.freemem %[[VAL_16]] : !fir.heap> +// CHECK: } +// CHECK: fir.unreachable +// CHECK: ^bb4: +// CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_8]], %[[VAL_5]] : index +// CHECK: fir.store %[[VAL_19]] to %[[VAL_7]] : !fir.ref +// CHECK: cf.br ^bb1 +// CHECK: ^bb5: +// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_22]] { +// CHECK: fir.freemem %[[VAL_20]] : !fir.heap> +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @alloca_dominate_return_in_cycle(%arg0: index) { + %0 = fir.alloca index + %c1 = arith.constant 1 : index + fir.store %c1 to %0 : !fir.ref + cf.br ^bb1 +^bb1: // 2 preds: ^bb0, ^bb2 + %1 = fir.load %0 : !fir.ref + %2 = fir.alloca !fir.array, %1 + fir.call @bar(%2) : (!fir.ref>) -> () + %3 = arith.addi %1, %c1 : index + fir.store %3 to %0 : !fir.ref + %4 = arith.cmpi slt, %3, %arg0 : index + cf.cond_br %4, ^bb2, ^bb3 +^bb2: // pred: ^bb1 + cf.br ^bb1 +^bb3: // pred: ^bb1 + return +} +// CHECK-LABEL: func.func @alloca_dominate_return_in_cycle( +// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK: %[[VAL_1:.*]] = fir.alloca !fir.heap> +// CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.heap> +// CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_3:.*]] = 
arith.constant 0 : i64 +// CHECK: %[[VAL_4:.*]] = fir.alloca index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: fir.store %[[VAL_5]] to %[[VAL_4]] : !fir.ref +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]] : !fir.ref +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_8]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_9]] { +// CHECK: fir.freemem %[[VAL_7]] : !fir.heap> +// CHECK: } +// CHECK: %[[VAL_10:.*]] = fir.allocmem !fir.array, %[[VAL_6]] {bindc_name = "", uniq_name = ""} +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.heap>) -> !fir.ref> +// CHECK: fir.store %[[VAL_10]] to %[[VAL_1]] : !fir.ref>> +// CHECK: fir.call @bar(%[[VAL_11]]) : (!fir.ref>) -> () +// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_6]], %[[VAL_5]] : index +// CHECK: fir.store %[[VAL_12]] to %[[VAL_4]] : !fir.ref +// CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_0]] : index +// CHECK: cf.cond_br %[[VAL_13]], ^bb2, ^bb3 +// CHECK: ^bb2: +// CHECK: cf.br ^bb1 +// CHECK: ^bb3: +// CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_16]] { +// CHECK: fir.freemem %[[VAL_14]] : !fir.heap> +// CHECK: } +// CHECK: return +// CHECK: } + +func.func private @bar(!fir.ref>) +func.func private @abort() From 578c6191eff7c69608cf5e363460f60a890d4065 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 17 Jul 2024 09:18:17 +0200 Subject: [PATCH 224/777] [libc++] Simplify std::is_void (#99033) --- libcxx/include/__type_traits/is_void.h | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/libcxx/include/__type_traits/is_void.h b/libcxx/include/__type_traits/is_void.h index 4c27060530c8e..46316b0d3a534 100644 --- a/libcxx/include/__type_traits/is_void.h +++ b/libcxx/include/__type_traits/is_void.h @@ -11,8 +11,6 @@ #include <__config> #include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -20,28 +18,14 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if __has_builtin(__is_void) - template -struct _LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_void(_Tp)> {}; +struct _LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_same(__remove_cv(_Tp), void)> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_void_v = __is_void(_Tp); +inline constexpr bool is_void_v = __is_same(__remove_cv(_Tp), void); # endif -#else - -template -struct _LIBCPP_TEMPLATE_VIS is_void : public is_same<__remove_cv_t<_Tp>, void> {}; - -# if _LIBCPP_STD_VER >= 17 -template -inline constexpr bool is_void_v = is_void<_Tp>::value; -# endif - -#endif // __has_builtin(__is_void) - _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___TYPE_TRAITS_IS_VOID_H From f2251f93ab6977c2a2fc547b9301da2c2627c663 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Wed, 17 Jul 2024 09:23:32 +0200 Subject: [PATCH 225/777] [mlir][gpu] Add mlir_c_runner_utils to fix #99035 This fixes the unit test that is broken in #99035. 
--- mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir index 77fa0deffdd69..f63dbbb431658 100644 --- a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir +++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir @@ -3,6 +3,7 @@ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ // RUN: --entry-point-result=void \ // RUN: | FileCheck %s From 863ad5af2d2e99dfb370bf370079e452212807ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 16 Jul 2024 19:00:45 +0200 Subject: [PATCH 226/777] [clang][Interp][NFC] Remove Block::deref() We already have Pointer::deref() --- clang/lib/AST/Interp/Interp.h | 3 +-- clang/lib/AST/Interp/InterpBlock.h | 7 ------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index c7d8604c7dc2a..16093393b5da2 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1252,8 +1252,7 @@ bool GetGlobalUnchecked(InterpState &S, CodePtr OpPC, uint32_t I) { const Pointer &Ptr = S.P.getPtrGlobal(I); if (!Ptr.isInitialized()) return false; - const Block *B = S.P.getGlobal(I); - S.Stk.push(B->deref()); + S.Stk.push(Ptr.deref()); return true; } diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 1f25de3589630..51799ee351753 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -105,13 +105,6 @@ class Block final { return reinterpret_cast(this) + sizeof(Block); } - /// Returns a view over the data. - template - T &deref() { return *reinterpret_cast(data()); } - template const T &deref() const { - return *reinterpret_cast(data()); - } - /// Invokes the constructor. void invokeCtor() { std::memset(rawData(), 0, Desc->getAllocSize()); From d3dab0cb8dd04a16e10b5f29b968d50749dcb2c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 16 Jul 2024 20:43:59 +0200 Subject: [PATCH 227/777] [clang][Interp][NFC] Assert initialization state in invoke{C,D}tor --- clang/lib/AST/Interp/InterpBlock.h | 2 ++ clang/lib/AST/Interp/InterpFrame.cpp | 1 + clang/lib/AST/Interp/InterpState.cpp | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 51799ee351753..ee33e5a4b2df0 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -107,6 +107,7 @@ class Block final { /// Invokes the constructor. void invokeCtor() { + assert(!IsInitialized); std::memset(rawData(), 0, Desc->getAllocSize()); if (Desc->CtorFn) Desc->CtorFn(this, data(), Desc->IsConst, Desc->IsMutable, @@ -116,6 +117,7 @@ class Block final { /// Invokes the Destructor. 
void invokeDtor() { + assert(IsInitialized); if (Desc->DtorFn) Desc->DtorFn(this, data(), Desc); IsInitialized = false; diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index d3f3e216b7eb2..1c37450ae1c6e 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -227,6 +227,7 @@ Pointer InterpFrame::getParamPointer(unsigned Off) { size_t BlockSize = sizeof(Block) + Desc.second->getAllocSize(); auto Memory = std::make_unique(BlockSize); auto *B = new (Memory.get()) Block(S.Ctx.getEvalID(), Desc.second); + B->invokeCtor(); // Copy the initial value. TYPE_SWITCH(Desc.first, new (B->data()) T(stackRef(Off))); diff --git a/clang/lib/AST/Interp/InterpState.cpp b/clang/lib/AST/Interp/InterpState.cpp index a8538541f4915..40eb28bfb4875 100644 --- a/clang/lib/AST/Interp/InterpState.cpp +++ b/clang/lib/AST/Interp/InterpState.cpp @@ -77,7 +77,7 @@ void InterpState::deallocate(Block *B) { // We moved the contents over to the DeadBlock. B->IsInitialized = false; - } else { + } else if (B->IsInitialized) { B->invokeDtor(); } } From 5e338f1f4ae28b9dd7d722a77ab204e358006a86 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 17 Jul 2024 08:25:15 +0100 Subject: [PATCH 228/777] [AMDGPU] clang-tidy: use emplace_back instead of push_back. NFC. --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 2 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 14 +++++++------- llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 8 ++++---- llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 6 +++--- .../AMDGPU/R600OptimizeVectorRegisters.cpp | 8 +++----- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 2 +- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 6 ++---- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 2 +- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 16 ++++++++-------- .../Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp | 2 +- 12 files changed, 34 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 9c71f20920c01..632657589bdd2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -285,7 +285,7 @@ void AMDGPUAsmPrinter::emitFunctionEntryLabel() { // Disassemble function name label to text. 
DisasmLines.push_back(MF->getName().str() + ":"); DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); - HexLines.push_back(""); + HexLines.emplace_back(""); } AsmPrinter::emitFunctionEntryLabel(); @@ -298,7 +298,7 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { (Twine("BB") + Twine(getFunctionNumber()) + "_" + Twine(MBB.getNumber()) + ":").str()); DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); - HexLines.push_back(""); + HexLines.emplace_back(""); } AsmPrinter::emitBasicBlockStart(MBB); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 74e67690d5e88..8d74689b5ad7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2466,7 +2466,7 @@ int SchedGroup::link(SUnit &SU, bool MakePred, // the A->B edge impossible, otherwise it returns true; bool Added = tryAddEdge(A, B); if (Added) - AddedEdges.push_back(std::pair(A, B)); + AddedEdges.emplace_back(A, B); else ++MissedEdges; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index f75961f6eaa77..bd0f0e048809b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -480,7 +480,7 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, // partitions) so it's a cheap operation. std::vector> BalancingQueue; for (unsigned I = 0; I < NumParts; ++I) - BalancingQueue.push_back(std::make_pair(I, 0)); + BalancingQueue.emplace_back(I, 0); // Helper function to handle assigning a function to a partition. This takes // care of updating the balancing queue. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 1bfb7c0edd80a..3758c768b8673 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -169,7 +169,7 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( // Remove and delete the return inst. BB->getTerminator()->eraseFromParent(); BranchInst::Create(NewRetBlock, BB); - Updates.push_back({DominatorTree::Insert, BB, NewRetBlock}); + Updates.emplace_back(DominatorTree::Insert, BB, NewRetBlock); } if (RequireAndPreserveDomTree) @@ -239,7 +239,7 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB}); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); } else { // Conditional branch. SmallVector Successors(successors(BB)); @@ -250,17 +250,17 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, // 'Successors' become successors of TransitionBB instead of BB, // and TransitionBB becomes a single successor of BB. 
- Updates.push_back({DominatorTree::Insert, BB, TransitionBB}); + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); for (BasicBlock *Successor : Successors) { - Updates.push_back({DominatorTree::Insert, TransitionBB, Successor}); - Updates.push_back({DominatorTree::Delete, BB, Successor}); + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); } // Create a branch that will always branch to the transition block and // references DummyReturnBB. BB->getTerminator()->eraseFromParent(); BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); - Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB}); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); } Changed = true; } @@ -281,7 +281,7 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, // Remove and delete the unreachable inst. BB->getTerminator()->eraseFromParent(); BranchInst::Create(UnreachableBlock, BB); - Updates.push_back({DominatorTree::Insert, BB, UnreachableBlock}); + Updates.emplace_back(DominatorTree::Insert, BB, UnreachableBlock); } Changed = true; } diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index ef67e5c937dc2..ccbfa4fde09a0 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -133,20 +133,20 @@ class R600EmitClauseMarkers : public MachineFunctionPass { const std::pair &BankLine = getAccessedBankLine(Sel); if (CachedConsts.empty()) { CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(0, KCacheIndex)); + UsedKCache.emplace_back(0, KCacheIndex); continue; } if (CachedConsts[0] == BankLine) { - UsedKCache.push_back(std::pair(0, KCacheIndex)); + UsedKCache.emplace_back(0, KCacheIndex); continue; } if (CachedConsts.size() == 1) { CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(1, KCacheIndex)); + UsedKCache.emplace_back(1, KCacheIndex); continue; } if (CachedConsts[1] == BankLine) { - UsedKCache.push_back(std::pair(1, KCacheIndex)); + UsedKCache.emplace_back(1, KCacheIndex); continue; } return false; diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 98e7359357891..29a43bf4dc52f 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -326,11 +326,11 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, Register Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == R600::OQAP) { - Result.push_back(std::pair(Index, 0U)); + Result.emplace_back(Index, 0U); } if (PV.contains(Reg)) { // 255 is used to tells its a PS/PV reg - Result.push_back(std::pair(255, 0U)); + Result.emplace_back(255, 0U); continue; } if (Index > 127) { @@ -339,7 +339,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, continue; } unsigned Chan = RI.getHWRegChan(Reg); - Result.push_back(std::pair(Index, Chan)); + Result.emplace_back(Index, Chan); } for (; i < 3; ++i) Result.push_back(DummyPair); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index affbae9b31d9f..a20319ea4f9d3 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -58,7 +58,7 @@ class RegSeqInfo { MachineOperand &MO = Instr->getOperand(i); unsigned Chan = Instr->getOperand(i + 1).getImm(); if (isImplicitlyDef(MRI, 
MO.getReg())) - UndefReg.push_back(Chan); + UndefReg.emplace_back(Chan); else RegToChan[MO.getReg()] = Chan; } @@ -154,14 +154,12 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, DenseMap::const_iterator PosInUntouched = Untouched->RegToChan.find(It.first); if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back( - std::pair(It.second, (*PosInUntouched).second)); + Remap.emplace_back(It.second, (*PosInUntouched).second); continue; } if (CurrentUndexIdx >= Untouched->UndefReg.size()) return false; - Remap.push_back(std::pair( - It.second, Untouched->UndefReg[CurrentUndexIdx++])); + Remap.emplace_back(It.second, Untouched->UndefReg[CurrentUndexIdx++]); } return true; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 3e0baede919cc..a32a18e506018 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -237,7 +237,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true); - CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + CSI.emplace_back(Reg, JunkFI); CalleeSavedFIs.push_back(JunkFI); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index bf8e61b554fae..5e89c286bfbbd 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -358,8 +358,7 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( LaneVGPR = SpillVGPRs.back(); } - SGPRSpillsToVirtualVGPRLanes[FI].push_back( - SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); + SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex); return true; } @@ -393,8 +392,7 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( LaneVGPR = SpillPhysVGPRs.back(); } - SGPRSpillsToPhysicalVGPRLanes[FI].push_back( - SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); + SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 6550f98018aa4..fb4f5ea4aa760 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -546,7 +546,7 @@ void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, } if (Succ->isHighLatencyBlock()) ++NumHighLatencySuccessors; - Succs.push_back(std::pair(Succ, Kind)); + Succs.emplace_back(Succ, Kind); assert(none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 19e761ef45b25..3dc8cc17afc16 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -307,7 +307,7 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); II.Needs |= Flag; - Worklist.push_back(&MI); + Worklist.emplace_back(&MI); } /// Mark all relevant definitions of register \p Reg in usage \p UseMI. 
@@ -539,7 +539,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { BBI.InNeeds |= StateExact; - Worklist.push_back(MBB); + Worklist.emplace_back(MBB); } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; @@ -568,7 +568,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { BBI.InNeeds |= StateExact; - Worklist.push_back(MBB); + Worklist.emplace_back(MBB); } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; @@ -638,7 +638,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, BI.Needs |= StateWQM; if (!(BI.InNeeds & StateWQM)) { BI.InNeeds |= StateWQM; - Worklist.push_back(MBB); + Worklist.emplace_back(MBB); } } @@ -649,7 +649,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { PrevII.OutNeeds |= InNeeds; - Worklist.push_back(PrevMI); + Worklist.emplace_back(PrevMI); } } } @@ -678,7 +678,7 @@ void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, InstrInfo &LastII = Instructions[LastMI]; if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { LastII.OutNeeds |= BI.OutNeeds; - Worklist.push_back(LastMI); + Worklist.emplace_back(LastMI); } } @@ -690,7 +690,7 @@ void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, PredBI.OutNeeds |= BI.InNeeds; PredBI.InNeeds |= BI.InNeeds; - Worklist.push_back(Pred); + Worklist.emplace_back(Pred); } // All successors must be prepared to accept the same set of WQM/Exact data. @@ -700,7 +700,7 @@ void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, continue; SuccBI.InNeeds |= BI.OutNeeds; - Worklist.push_back(Succ); + Worklist.emplace_back(Succ); } } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp index a4f4a9ed5da41..ceb475d77cb32 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp @@ -37,7 +37,7 @@ void DelayedMCExprs::assignDocNode(msgpack::DocNode &DN, msgpack::Type Type, } } - DelayedExprs.push_back(Expr{DN, Type, ExprValue}); + DelayedExprs.emplace_back(DN, Type, ExprValue); } bool DelayedMCExprs::resolveDelayedExpressions() { From a5b5208ba627da46310db67af0dcbb0a824fab92 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Wed, 17 Jul 2024 10:00:47 +0200 Subject: [PATCH 229/777] [clang] Be careful when choosing "fast path" for initialization with #embed (#99023) When #embed appears in an initializer list, we may choose a "fast path" if the target declaration is a char array. We simply initialize it with string literal that contains embedded data. However we need to be careful when checking that we actually can use this "fast path" since char array may be nested in a struct. 
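A minimal sketch of the situation the message describes, mirroring the embed_constexpr.cpp test added below; it assumes the same numbers.txt input, whose first bytes — judging from the codegen checks — are the characters '0' through '9' followed by a newline:

struct HasCharArray { unsigned char h[10]; };
struct Wrapper { int a; struct HasCharArray d; };

// The braced initializer targets the whole struct: the first embedded byte
// lands in W.a and the remaining bytes fill the nested array W.d.h, so the
// char-array "fast path" must not be applied to the struct itself.
constexpr struct Wrapper W = {
#embed "numbers.txt"
};

static_assert(W.d.h[2] == '3');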
--- clang/lib/Sema/SemaInit.cpp | 17 +++++++++++++---- clang/test/Preprocessor/embed_codegen.cpp | 8 ++++++++ clang/test/Preprocessor/embed_constexpr.cpp | 8 ++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index a27ed02fc73b8..d97a5c8988840 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1993,9 +1993,18 @@ static bool checkDestructorReference(QualType ElementType, SourceLocation Loc, return SemaRef.DiagnoseUseOfDecl(Destructor, Loc); } -static bool canInitializeArrayWithEmbedDataString(ArrayRef ExprList, - QualType InitType, - ASTContext &Context) { +static bool +canInitializeArrayWithEmbedDataString(ArrayRef ExprList, + const InitializedEntity &Entity, + ASTContext &Context) { + QualType InitType = Entity.getType(); + const InitializedEntity *Parent = &Entity; + + while (Parent) { + InitType = Parent->getType(); + Parent = Parent->getParent(); + } + // Only one initializer, it's an embed and the types match; EmbedExpr *EE = ExprList.size() == 1 @@ -2034,7 +2043,7 @@ void InitListChecker::CheckArrayType(const InitializedEntity &Entity, } } - if (canInitializeArrayWithEmbedDataString(IList->inits(), DeclType, + if (canInitializeArrayWithEmbedDataString(IList->inits(), Entity, SemaRef.Context)) { EmbedExpr *Embed = cast(IList->inits()[0]); IList->setInit(0, Embed->getDataStringLiteral()); diff --git a/clang/test/Preprocessor/embed_codegen.cpp b/clang/test/Preprocessor/embed_codegen.cpp index 201bf300bc669..2cf14d8d6a15d 100644 --- a/clang/test/Preprocessor/embed_codegen.cpp +++ b/clang/test/Preprocessor/embed_codegen.cpp @@ -3,6 +3,7 @@ // CHECK: @__const._Z3fooi.ca = private unnamed_addr constant [3 x i32] [i32 0, i32 106, i32 107], align 4 // CHECK: @__const._Z3fooi.sc = private unnamed_addr constant %struct.S1 { i32 106, i32 107, i32 0 }, align 4 // CHECK: @__const._Z3fooi.t = private unnamed_addr constant [3 x %struct.T] [%struct.T { [2 x i32] [i32 48, i32 49], %struct.S1 { i32 50, i32 51, i32 52 } }, %struct.T { [2 x i32] [i32 53, i32 54], %struct.S1 { i32 55, i32 56, i32 57 } }, %struct.T { [2 x i32] [i32 10, i32 0], %struct.S1 zeroinitializer }], align 16 +// CHECK: @__const._Z3fooi.W = private unnamed_addr constant %struct.Wrapper { i32 48, %struct.HasCharArray { [10 x i8] c"123456789\0A" } }, align 4 void foo(int a) { // CHECK: %a.addr = alloca i32, align 4 // CHECK: store i32 %a, ptr %a.addr, align 4 @@ -82,4 +83,11 @@ struct T tnonc[] = { #embed prefix(,) }; + +struct HasCharArray { unsigned char h[10]; }; +struct Wrapper { int a; struct HasCharArray d; }; +constexpr struct Wrapper W = { +#embed "numbers.txt" +}; + } diff --git a/clang/test/Preprocessor/embed_constexpr.cpp b/clang/test/Preprocessor/embed_constexpr.cpp index a7857641a2e8d..c51c02def7dfb 100644 --- a/clang/test/Preprocessor/embed_constexpr.cpp +++ b/clang/test/Preprocessor/embed_constexpr.cpp @@ -96,3 +96,11 @@ struct ST {}; ST< #embed limit(1) > st; + +struct HasCharArray { unsigned char h[10]; }; +struct Wrapper { int a; struct HasCharArray d; }; +constexpr struct Wrapper W = { +#embed "numbers.txt" +}; + +static_assert(W.d.h[2] == '3'); From 0905732f75cb0f774972c721810aba74021102f2 Mon Sep 17 00:00:00 2001 From: Abe Date: Wed, 17 Jul 2024 04:07:17 -0400 Subject: [PATCH 230/777] [clang-tools-extra] Fix typos in Modularize.rst (#99256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainly, fixed “the Clang module mechanism doesn’t support headers the rely on 
other headers” => “the Clang module mechanism doesn’t support headers that rely on other headers”. [emphasis on “the” versus “that”] --- clang-tools-extra/docs/modularize.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/docs/modularize.rst b/clang-tools-extra/docs/modularize.rst index 64ca8c99d4e8e..97fd33b958650 100644 --- a/clang-tools-extra/docs/modularize.rst +++ b/clang-tools-extra/docs/modularize.rst @@ -254,8 +254,8 @@ For example, with the same header list from above:: } Note that headers with dependents will be ignored with a warning, as the -Clang module mechanism doesn't support headers the rely on other headers -to be included first. +Clang module mechanism doesn't support headers that rely on other headers +being included first. The module map format defines some keywords which can't be used in module names. If a header has one of these names, an underscore ('_') will be From caaf8099efa87a7ebca8920971b7d7f719808591 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Wed, 17 Jul 2024 10:15:19 +0200 Subject: [PATCH 231/777] [Offload][OMPT] Add callbacks for (dis)associate_ptr (#99046) This adds the OMPT callbacks for the API functions disassociate_ptr and associate_ptr. --- offload/include/OpenMP/OMPT/Interface.h | 29 +++++++++++++ offload/src/OpenMP/API.cpp | 11 +++++ offload/src/OpenMP/OMPT/Callback.cpp | 57 +++++++++++++++++++++++++ offload/test/ompt/omp_api.c | 39 +++++++++++++++++ 4 files changed, 136 insertions(+) create mode 100644 offload/test/ompt/omp_api.c diff --git a/offload/include/OpenMP/OMPT/Interface.h b/offload/include/OpenMP/OMPT/Interface.h index 327fadfcd4acd..0dc1bad8f7ece 100644 --- a/offload/include/OpenMP/OMPT/Interface.h +++ b/offload/include/OpenMP/OMPT/Interface.h @@ -109,6 +109,25 @@ class Interface { /// Top-level function for invoking callback after target update construct void endTargetUpdate(int64_t DeviceId, void *Code); + /// Top-level function for invoking callback before target associate API + void beginTargetAssociatePointer(int64_t DeviceId, void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, void *Code); + + /// Top-level function for invoking callback after target associate API + void endTargetAssociatePointer(int64_t DeviceId, void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, void *Code); + + /// Top-level function for invoking callback before target disassociate API + void beginTargetDisassociatePointer(int64_t DeviceId, void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, + void *Code); + + /// Top-level function for invoking callback after target disassociate API + void endTargetDisassociatePointer(int64_t DeviceId, void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, void *Code); + + // Target kernel callbacks + /// Top-level function for invoking callback before target construct void beginTarget(int64_t DeviceId, void *Code); @@ -137,6 +156,16 @@ class Interface { return std::make_pair(std::mem_fn(&Interface::beginTargetDataRetrieve), std::mem_fn(&Interface::endTargetDataRetrieve)); + if constexpr (OpType == ompt_target_data_associate) + return std::make_pair( + std::mem_fn(&Interface::beginTargetAssociatePointer), + std::mem_fn(&Interface::endTargetAssociatePointer)); + + if constexpr (OpType == ompt_target_data_disassociate) + return std::make_pair( + std::mem_fn(&Interface::beginTargetDisassociatePointer), + std::mem_fn(&Interface::endTargetDisassociatePointer)); + llvm_unreachable("Unhandled target data operation type!"); } diff --git a/offload/src/OpenMP/API.cpp 
b/offload/src/OpenMP/API.cpp index 374c54163d6a4..e59bdba8abf0e 100644 --- a/offload/src/OpenMP/API.cpp +++ b/offload/src/OpenMP/API.cpp @@ -597,6 +597,12 @@ EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr, FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); void *DeviceAddr = (void *)((uint64_t)DevicePtr + (uint64_t)DeviceOffset); + + OMPT_IF_BUILT(InterfaceRAII( + RegionInterface.getCallbacks(), DeviceNum, + const_cast(HostPtr), const_cast(DevicePtr), Size, + __builtin_return_address(0))); + int Rc = DeviceOrErr->getMappingInfo().associatePtr( const_cast(HostPtr), const_cast(DeviceAddr), Size); DP("omp_target_associate_ptr returns %d\n", Rc); @@ -625,6 +631,11 @@ EXTERN int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum) { if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); + OMPT_IF_BUILT(InterfaceRAII( + RegionInterface.getCallbacks(), DeviceNum, + const_cast(HostPtr), + /*DevicePtr=*/nullptr, /*Size=*/0, __builtin_return_address(0))); + int Rc = DeviceOrErr->getMappingInfo().disassociatePtr( const_cast(HostPtr)); DP("omp_target_disassociate_ptr returns %d\n", Rc); diff --git a/offload/src/OpenMP/OMPT/Callback.cpp b/offload/src/OpenMP/OMPT/Callback.cpp index f285843e39f38..f2964281eeb95 100644 --- a/offload/src/OpenMP/OMPT/Callback.cpp +++ b/offload/src/OpenMP/OMPT/Callback.cpp @@ -332,6 +332,63 @@ void Interface::endTargetUpdate(int64_t DeviceId, void *Code) { endTargetRegion(); } +void Interface::beginTargetAssociatePointer(int64_t DeviceId, void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, + void *Code) { + beginTargetDataOperation(); + if (ompt_callback_target_data_op_emi_fn) { + ompt_callback_target_data_op_emi_fn( + ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId, + ompt_target_data_associate, HstPtrBegin, omp_get_initial_device(), + TgtPtrBegin, DeviceId, Size, Code); + } else if (ompt_callback_target_data_op_fn) { + HostOpId = createOpId(); + ompt_callback_target_data_op_fn( + TargetData.value, HostOpId, ompt_target_data_associate, HstPtrBegin, + omp_get_initial_device(), TgtPtrBegin, DeviceId, Size, Code); + } +} + +void Interface::endTargetAssociatePointer(int64_t DeviceId, void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, + void *Code) { + if (ompt_callback_target_data_op_emi_fn) { + ompt_callback_target_data_op_emi_fn( + ompt_scope_end, TargetTaskData, &TargetData, &HostOpId, + ompt_target_data_associate, HstPtrBegin, omp_get_initial_device(), + TgtPtrBegin, DeviceId, Size, Code); + } +} + +void Interface::beginTargetDisassociatePointer(int64_t DeviceId, + void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, + void *Code) { + beginTargetDataOperation(); + if (ompt_callback_target_data_op_emi_fn) { + ompt_callback_target_data_op_emi_fn( + ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId, + ompt_target_data_disassociate, HstPtrBegin, omp_get_initial_device(), + TgtPtrBegin, DeviceId, Size, Code); + } else if (ompt_callback_target_data_op_fn) { + HostOpId = createOpId(); + ompt_callback_target_data_op_fn( + TargetData.value, HostOpId, ompt_target_data_disassociate, HstPtrBegin, + omp_get_initial_device(), TgtPtrBegin, DeviceId, Size, Code); + } +} +void Interface::endTargetDisassociatePointer(int64_t DeviceId, + void *HstPtrBegin, + void *TgtPtrBegin, size_t Size, + void *Code) { + if (ompt_callback_target_data_op_emi_fn) { + ompt_callback_target_data_op_emi_fn( + ompt_scope_end, TargetTaskData, &TargetData, &HostOpId, + 
ompt_target_data_disassociate, HstPtrBegin, omp_get_initial_device(), + TgtPtrBegin, DeviceId, Size, Code); + } +} + void Interface::beginTarget(int64_t DeviceId, void *Code) { beginTargetRegion(); if (ompt_callback_target_emi_fn) { diff --git a/offload/test/ompt/omp_api.c b/offload/test/ompt/omp_api.c new file mode 100644 index 0000000000000..a16ef7a64aa7d --- /dev/null +++ b/offload/test/ompt/omp_api.c @@ -0,0 +1,39 @@ +// RUN: %libomptarget-compile-run-and-check-generic +// REQUIRES: ompt +// REQUIRES: gpu + +#include "omp.h" +#include +#include + +#include "callbacks.h" +#include "register_non_emi.h" + +#define N 1024 + +int main(int argc, char **argv) { + int *h_a; + int *d_a; + + h_a = (int *)malloc(N * sizeof(int)); + memset(h_a, 0, N); + + d_a = (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device()); + + omp_target_associate_ptr(h_a, d_a, N * sizeof(int), 0, + omp_get_default_device()); + omp_target_disassociate_ptr(h_a, omp_get_default_device()); + + omp_target_free(d_a, omp_get_default_device()); + free(h_a); + + return 0; +} + +// clang-format off +/// CHECK: Callback Init: +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=5 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=6 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback Fini: From d28ed29d6bd9f0389092775406fff7e6205d4d5f Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Wed, 17 Jul 2024 09:21:52 +0100 Subject: [PATCH 232/777] [TTI][WebAssembly] Pairwise reduction expansion (#93948) WebAssembly doesn't support horizontal operations nor does it have a way of expressing fast-math or reassoc flags, so runtimes are currently unable to use pairwise operations when generating code from the existing shuffle patterns. This patch allows the backend to select which, arbitary, shuffle pattern to be used per reduction intrinsic. The default behaviour is the same as the existing, which is by splitting the vector into a top and bottom half. The other pattern introduced is for a pairwise shuffle. WebAssembly enables pairwise reductions for int/fp add/sub. --- .../llvm/Analysis/TargetTransformInfo.h | 14 + .../llvm/Analysis/TargetTransformInfoImpl.h | 5 + .../include/llvm/Transforms/Utils/LoopUtils.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 6 + llvm/lib/CodeGen/ExpandReductions.cpp | 10 +- .../WebAssemblyTargetTransformInfo.cpp | 12 + .../WebAssemblyTargetTransformInfo.h | 2 + llvm/lib/Transforms/Utils/LoopUtils.cpp | 42 +- .../test/CodeGen/WebAssembly/vector-reduce.ll | 1074 +++++++++++++++++ 9 files changed, 1151 insertions(+), 16 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/vector-reduce.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index dcdd9f82cde8e..bda9d4e624505 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1705,6 +1705,13 @@ class TargetTransformInfo { /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; + enum struct ReductionShuffle { SplitHalf, Pairwise }; + + /// \returns The shuffle sequence pattern used to expand the given reduction + /// intrinsic. 
+ ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; + /// \returns the size cost of rematerializing a GlobalValue address relative /// to a stack reload. unsigned getGISelRematGlobalCost() const; @@ -2156,6 +2163,8 @@ class TargetTransformInfo::Concept { virtual bool preferEpilogueVectorization() const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; + virtual ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; virtual unsigned getMinTripCountTailFoldingThreshold() const = 0; virtual bool enableScalableVectorization() const = 0; @@ -2898,6 +2907,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldExpandReduction(II); } + ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override { + return Impl.getPreferredExpandedReductionShuffle(II); + } + unsigned getGISelRematGlobalCost() const override { return Impl.getGISelRematGlobalCost(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 01624de190d51..c1eb6151440be 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -936,6 +936,11 @@ class TargetTransformInfoImplBase { bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } + TTI::ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const { + return TTI::ReductionShuffle::SplitHalf; + } + unsigned getGISelRematGlobalCost() const { return 1; } unsigned getMinTripCountTailFoldingThreshold() const { return 0; } diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 1a878126aa082..b01a447f3c28b 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -15,6 +15,7 @@ #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/VectorBuilder.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -385,6 +386,7 @@ Value *getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, /// Generates a vector reduction using shufflevectors to reduce the value. /// Fast-math-flags are propagated using the IRBuilder's setting. Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op, + TargetTransformInfo::ReductionShuffle RS, RecurKind MinMaxKind = RecurKind::None); /// Create a target reduction of the given vector. 
The reduction operation diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c175d1737e54b..be4069bb3eabf 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1317,6 +1317,12 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const { return TTIImpl->shouldExpandReduction(II); } +TargetTransformInfo::ReductionShuffle +TargetTransformInfo::getPreferredExpandedReductionShuffle( + const IntrinsicInst *II) const { + return TTIImpl->getPreferredExpandedReductionShuffle(II); +} + unsigned TargetTransformInfo::getGISelRematGlobalCost() const { return TTIImpl->getGISelRematGlobalCost(); } diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp index 0b1504e51b1bb..d6778ec666cbe 100644 --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -59,6 +59,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { isa(II) ? II->getFastMathFlags() : FastMathFlags{}; Intrinsic::ID ID = II->getIntrinsicID(); RecurKind RK = getMinMaxReductionRecurKind(ID); + TargetTransformInfo::ReductionShuffle RS = + TTI->getPreferredExpandedReductionShuffle(II); Value *Rdx = nullptr; IRBuilder<> Builder(II); @@ -79,7 +81,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { if (!isPowerOf2_32( cast(Vec->getType())->getNumElements())) continue; - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx, "bin.rdx"); } @@ -112,7 +114,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { break; } unsigned RdxOpcode = getArithmeticReductionInstruction(ID); - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); break; } case Intrinsic::vector_reduce_add: @@ -127,7 +129,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { cast(Vec->getType())->getNumElements())) continue; unsigned RdxOpcode = getArithmeticReductionInstruction(ID); - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); break; } case Intrinsic::vector_reduce_fmax: @@ -140,7 +142,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { !FMF.noNaNs()) continue; unsigned RdxOpcode = getArithmeticReductionInstruction(ID); - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); break; } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 9a434d9b1db54..b109594811d97 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -94,6 +94,18 @@ WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return Cost; } +TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle( + const IntrinsicInst *II) const { + + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::vector_reduce_fadd: + return TTI::ReductionShuffle::Pairwise; + } + return TTI::ReductionShuffle::SplitHalf; +} + bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { // Allow inlining only when the Callee has a subset of the 
Caller's diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index e10f0928ed531..269922cc3ea84 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -70,6 +70,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase { TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1); + TTI::ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; /// @} bool areInlineCompatible(const Function *Caller, diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index ff93035ce0652..4609376a748f9 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1077,7 +1077,9 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, // Helper to generate a log2 shuffle reduction. Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, - unsigned Op, RecurKind RdxKind) { + unsigned Op, + TargetTransformInfo::ReductionShuffle RS, + RecurKind RdxKind) { unsigned VF = cast(Src->getType())->getNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each @@ -1091,18 +1093,10 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, // will never be relevant here. Note that it would be generally unsound to // propagate these from an intrinsic call to the expansion anyways as we/ // change the order of operations. - Value *TmpVec = Src; - SmallVector ShuffleMask(VF); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = i / 2 + j; - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1); - + auto BuildShuffledOp = [&Builder, &Op, + &RdxKind](SmallVectorImpl &ShuffleMask, + Value *&TmpVec) -> void { Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf"); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) { TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); @@ -1111,6 +1105,30 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, "Invalid min/max"); TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf); } + }; + + Value *TmpVec = Src; + if (TargetTransformInfo::ReductionShuffle::Pairwise == RS) { + SmallVector ShuffleMask(VF); + for (unsigned stride = 1; stride < VF; stride <<= 1) { + // Initialise the mask with undef. + std::fill(ShuffleMask.begin(), ShuffleMask.end(), -1); + for (unsigned j = 0; j < VF; j += stride << 1) { + ShuffleMask[j] = j + stride; + } + BuildShuffledOp(ShuffleMask, TmpVec); + } + } else { + SmallVector ShuffleMask(VF); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = i / 2 + j; + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1); + BuildShuffledOp(ShuffleMask, TmpVec); + } } // The result is in the first element of the vector. 
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll new file mode 100644 index 0000000000000..4b1a1a8a0c5b6 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll @@ -0,0 +1,1074 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=wasm32 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128 + +define i64 @pairwise_add_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_add_v2i64: +; SIMD128: .functype pairwise_add_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: i64x2.add $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.add.i64.v4i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_add_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_add_v4i32: +; SIMD128: .functype pairwise_add_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.add $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.add $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_add_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_add_v8i16: +; SIMD128: .functype pairwise_add_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_add_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_add_v16i8: +; SIMD128: .functype pairwise_add_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push4=, 
$pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_mul_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_mul_v2i64: +; SIMD128: .functype pairwise_mul_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: i64x2.mul $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_mul_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_mul_v4i32: +; SIMD128: .functype pairwise_mul_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.mul $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.mul $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_mul_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_mul_v8i16: +; SIMD128: .functype pairwise_mul_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.mul $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.mul $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.mul $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_mul_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_mul_v16i8: +; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0 +; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: local.tee $push31=, $1=, $pop32 +; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0 +; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25 +; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4 +; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4 +; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 +; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24 +; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2 +; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2 +; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 +; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6 +; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6 +; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15 +; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17 +; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21 +; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1 +; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1 +; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 +; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5 +; 
SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5 +; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 +; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9 +; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3 +; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3 +; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7 +; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7 +; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 +; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2 +; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6 +; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14 +; SIMD128-NEXT: return $pop30 + %res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_and_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_and_v2i64: +; SIMD128: .functype pairwise_and_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.and $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_and_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_and_v4i32: +; SIMD128: .functype pairwise_and_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.and $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.and $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_and_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_and_v8i16: +; SIMD128: .functype pairwise_and_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_and_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_and_v16i8: +; SIMD128: .functype pairwise_and_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_or_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_or_v2i64: +; SIMD128: .functype pairwise_or_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.or $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_or_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_or_v4i32: +; SIMD128: .functype pairwise_or_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.or $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.or $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_or_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_or_v8i16: +; SIMD128: .functype pairwise_or_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_or_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_or_v16i8: +; SIMD128: .functype pairwise_or_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_xor_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_xor_v2i64: +; SIMD128: .functype pairwise_xor_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle 
$push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.xor $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_xor_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_xor_v4i32: +; SIMD128: .functype pairwise_xor_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.xor $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.xor $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_xor_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_xor_v8i16: +; SIMD128: .functype pairwise_xor_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_xor_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_xor_v16i8: +; SIMD128: .functype pairwise_xor_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_smax_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_smax_v2i64: +; SIMD128: .functype pairwise_smax_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: i64x2.gt_s $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_smax_v4i32(<4 x i32> %arg) { +; 
SIMD128-LABEL: pairwise_smax_v4i32: +; SIMD128: .functype pairwise_smax_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_s $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_s $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_smax_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_smax_v8i16: +; SIMD128: .functype pairwise_smax_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_smax_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_smax_v16i8: +; SIMD128: .functype pairwise_smax_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_smin_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_smin_v2i64: +; SIMD128: .functype pairwise_smin_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: i64x2.lt_s $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_smin_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_smin_v4i32: +; SIMD128: .functype pairwise_smin_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_s $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee 
$push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_s $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_smin_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_smin_v8i16: +; SIMD128: .functype pairwise_smin_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_smin_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_smin_v16i8: +; SIMD128: .functype pairwise_smin_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_umax_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_umax_v2i64: +; SIMD128: .functype pairwise_umax_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 +; SIMD128-NEXT: i64.const $push4=, -1 +; SIMD128-NEXT: i64.const $push3=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 +; SIMD128-NEXT: i64.gt_u $push2=, $pop1, $pop0 +; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 +; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 +; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 + %res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_umax_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_umax_v4i32: +; SIMD128: .functype pairwise_umax_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_u $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee 
$push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_u $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_umax_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_umax_v8i16: +; SIMD128: .functype pairwise_umax_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_umax_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_umax_v16i8: +; SIMD128: .functype pairwise_umax_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_umin_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_umin_v2i64: +; SIMD128: .functype pairwise_umin_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 +; SIMD128-NEXT: i64.const $push4=, -1 +; SIMD128-NEXT: i64.const $push3=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 +; SIMD128-NEXT: i64.lt_u $push2=, $pop1, $pop0 +; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 +; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 +; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 + %res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_umin_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_umin_v4i32: +; SIMD128: .functype pairwise_umin_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_u $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee 
$push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_u $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_umin_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_umin_v8i16: +; SIMD128: .functype pairwise_umin_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_umin_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_umin_v16i8: +; SIMD128: .functype pairwise_umin_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define double @pairwise_add_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_add_v2f64: +; SIMD128: .functype pairwise_add_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.add $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define double @pairwise_add_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_add_v2f64_fast: +; SIMD128: .functype pairwise_add_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: f64x2.add $push1=, $0, $pop0 +; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define float @pairwise_add_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_add_v4f32: +; SIMD128: .functype pairwise_add_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: 
f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.add $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.add $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.add $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_add_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_add_v4f32_fast: +; SIMD128: .functype pairwise_add_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push2=, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_add_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_add_v4f32_reassoc: +; SIMD128: .functype pairwise_add_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push2=, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define double @pairwise_mul_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_mul_v2f64: +; SIMD128: .functype pairwise_mul_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 0 +; SIMD128-NEXT: f64.const $push1=, -0x0p0 +; SIMD128-NEXT: f64.mul $push2=, $pop0, $pop1 +; SIMD128-NEXT: f64x2.extract_lane $push3=, $0, 1 +; SIMD128-NEXT: f64.mul $push4=, $pop2, $pop3 +; SIMD128-NEXT: return $pop4 + %res = tail call double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define double @pairwise_mul_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_mul_v2f64_fast: +; SIMD128: .functype pairwise_mul_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64.const $push0=, 0x0p0 +; SIMD128-NEXT: return $pop0 + %res = tail call fast double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define float @pairwise_mul_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_mul_v4f32: +; SIMD128: .functype pairwise_mul_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 0 +; SIMD128-NEXT: f32.const $push1=, -0x0p0 +; SIMD128-NEXT: f32.mul $push2=, $pop0, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 1 +; SIMD128-NEXT: f32.mul $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 2 +; SIMD128-NEXT: f32.mul $push6=, $pop4, $pop5 +; SIMD128-NEXT: f32x4.extract_lane $push7=, $0, 3 +; SIMD128-NEXT: f32.mul $push8=, $pop6, $pop7 +; SIMD128-NEXT: return $pop8 + %res = tail call float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) + ret float 
%res +} + +define float @pairwise_mul_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_mul_v4f32_fast: +; SIMD128: .functype pairwise_mul_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32.const $push0=, 0x0p0 +; SIMD128-NEXT: return $pop0 + %res = tail call fast float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_mul_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_mul_v4f32_reassoc: +; SIMD128: .functype pairwise_mul_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.mul $push7=, $0, $pop0 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.mul $push2=, $pop6, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: f32.const $push4=, -0x0p0 +; SIMD128-NEXT: f32.mul $push5=, $pop3, $pop4 +; SIMD128-NEXT: return $pop5 + %res = tail call reassoc float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define double @pairwise_max_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_max_v2f64: +; SIMD128: .functype pairwise_max_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: call $push2=, fmax, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_max_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_max_v2f64_fast: +; SIMD128: .functype pairwise_max_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: f64x2.gt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_max_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_max_v4f32: +; SIMD128: .functype pairwise_max_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fmaxf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fmaxf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fmaxf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_max_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_max_v4f32_fast: +; SIMD128: .functype pairwise_max_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push8=, $1=, $pop9 +; SIMD128-NEXT: f32x4.gt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop8, $pop0 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: 
local.tee $push4=, $1=, $pop5 +; SIMD128-NEXT: f32x4.gt $push1=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push2=, $pop6, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_max_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_max_v4f32_reassoc: +; SIMD128: .functype pairwise_max_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fmaxf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fmaxf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fmaxf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) + ret float %res +} + +define double @pairwise_min_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_min_v2f64: +; SIMD128: .functype pairwise_min_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: call $push2=, fmin, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_min_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_min_v2f64_fast: +; SIMD128: .functype pairwise_min_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: f64x2.lt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_min_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_min_v4f32: +; SIMD128: .functype pairwise_min_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fminf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fminf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fminf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_min_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_min_v4f32_fast: +; SIMD128: .functype pairwise_min_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push8=, $1=, $pop9 +; SIMD128-NEXT: f32x4.lt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop8, $pop0 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push4=, $1=, $pop5 +; SIMD128-NEXT: f32x4.lt $push1=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push2=, $pop6, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: 
return $pop3 + %res = tail call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_min_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_min_v4f32_reassoc: +; SIMD128: .functype pairwise_min_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fminf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fminf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fminf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) + ret float %res +} + +define double @pairwise_maximum_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_maximum_v2f64: +; SIMD128: .functype pairwise_maximum_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_maximum_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_maximum_v2f64_fast: +; SIMD128: .functype pairwise_maximum_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_maximum_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_maximum_v4f32: +; SIMD128: .functype pairwise_maximum_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.max $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.max $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_maximum_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_maximum_v4f32_fast: +; SIMD128: .functype pairwise_maximum_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.max $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.max $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_maximum_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_maximum_v4f32_reassoc: +; SIMD128: .functype pairwise_maximum_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.max $push4=, 
$pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.max $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg) + ret float %res +} + +define double @pairwise_minimum_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_minimum_v2f64: +; SIMD128: .functype pairwise_minimum_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_minimum_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_minimum_v2f64_fast: +; SIMD128: .functype pairwise_minimum_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_minimum_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_minimum_v4f32: +; SIMD128: .functype pairwise_minimum_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.min $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.min $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_minimum_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_minimum_v4f32_fast: +; SIMD128: .functype pairwise_minimum_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.min $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.min $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call fast float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_minimum_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_minimum_v4f32_reassoc: +; SIMD128: .functype pairwise_minimum_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.min $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.min $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg) + ret float %res +} From f270a4dd6667759d7305797a077ae09648318ac7 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Wed, 17 Jul 2024 01:31:52 -0700 Subject: [PATCH 233/777] [AArch64] Don't tail call memset if it would convert to a bzero. (#98969) Well, not quite that simple. 
We can tc memset since it returns the first argument but bzero doesn't do that and therefore we can end up miscompiling. This patch also refactors the logic out of isInTailCallPosition() into the callers. As a result memcpy and memmove are also modified to do the same thing for consistency. rdar://131419786 --- llvm/include/llvm/CodeGen/Analysis.h | 10 ++- llvm/include/llvm/CodeGen/SelectionDAG.h | 26 +++++--- llvm/lib/CodeGen/Analysis.cpp | 57 ++++++----------- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 62 +++++++++++++++---- .../SelectionDAG/SelectionDAGBuilder.cpp | 41 ++++++------ .../Target/AArch64/AArch64ISelLowering.cpp | 7 ++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- .../Target/Hexagon/HexagonISelLowering.cpp | 10 +-- llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 2 +- .../LoongArch/LoongArchISelLowering.cpp | 2 +- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 11 ++-- llvm/lib/Target/Mips/MipsISelLowering.cpp | 2 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 10 +-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4 +- .../Target/SystemZ/SystemZISelLowering.cpp | 2 +- .../WebAssembly/WebAssemblyISelLowering.cpp | 9 +-- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +- llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 2 +- llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 4 +- llvm/lib/Target/XCore/XCoreISelLowering.cpp | 6 +- .../AArch64/no-tail-call-bzero-from-memset.ll | 20 ++++++ 22 files changed, 172 insertions(+), 122 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/no-tail-call-bzero-from-memset.ll diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h index 6f7ed22b8ac71..362cc30bbd06a 100644 --- a/llvm/include/llvm/CodeGen/Analysis.h +++ b/llvm/include/llvm/CodeGen/Analysis.h @@ -126,7 +126,8 @@ ICmpInst::Predicate getICmpCondCode(ISD::CondCode Pred); /// between it and the return. /// /// This function only tests target-independent requirements. -bool isInTailCallPosition(const CallBase &Call, const TargetMachine &TM); +bool isInTailCallPosition(const CallBase &Call, const TargetMachine &TM, + bool ReturnsFirstArg = false); /// Test if given that the input instruction is in the tail call position, if /// there is an attribute mismatch between the caller and the callee that will @@ -144,7 +145,12 @@ bool attributesPermitTailCall(const Function *F, const Instruction *I, /// optimization. bool returnTypeIsEligibleForTailCall(const Function *F, const Instruction *I, const ReturnInst *Ret, - const TargetLoweringBase &TLI); + const TargetLoweringBase &TLI, + bool ReturnsFirstArg = false); + +/// Returns true if the parent of \p CI returns CI's first argument after +/// calling \p CI. +bool funcReturnsFirstArgOfCall(const CallInst &CI); DenseMap getEHScopeMembership(const MachineFunction &MF); diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 8e189e9e8bf86..16ec65f2e7daa 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1182,16 +1182,22 @@ class SelectionDAG { /// stack arguments from being clobbered. 
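To make the rationale in the commit message above concrete: a minimal C++ sketch of the pattern the patch protects (the function name zero1k is hypothetical; the patch's own test, no-tail-call-bzero-from-memset.ll later in this diff, exercises the same shape at the IR level). memset returns its destination pointer, so a memset call in tail position can stand in for the enclosing function's return value; bzero returns nothing, so lowering the same tail call to bzero would drop the pointer the caller expects back.

#include <cstring>

// Zeroes a buffer and hands it back to the caller.
// Emitting the memset libcall as a tail call is fine here: memset's
// return value is its first argument, which is exactly what this
// function returns. If the call were converted to bzero, a tail call
// would miscompile, because bzero produces no value to return.
void *zero1k(void *p) {
  std::memset(p, 0, 1000);
  return p;
}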
SDValue getStackArgumentTokenFactor(SDValue Chain); - SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, - SDValue Size, Align Alignment, bool isVol, - bool AlwaysInline, bool isTailCall, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo, - const AAMDNodes &AAInfo = AAMDNodes(), - AAResults *AA = nullptr); - + /* \p CI if not null is the memset call being lowered. + * \p OverrideTailCall is an optional parameter that can be used to override + * the tail call optimization decision. */ + SDValue + getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, + const CallInst *CI, std::optional OverrideTailCall, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, + const AAMDNodes &AAInfo = AAMDNodes(), AAResults *AA = nullptr); + + /* \p CI if not null is the memset call being lowered. + * \p OverrideTailCall is an optional parameter that can be used to override + * the tail call optimization decision. */ SDValue getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, - SDValue Size, Align Alignment, bool isVol, bool isTailCall, + SDValue Size, Align Alignment, bool isVol, + const CallInst *CI, std::optional OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo = AAMDNodes(), @@ -1199,7 +1205,7 @@ class SelectionDAG { SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, - bool AlwaysInline, bool isTailCall, + bool AlwaysInline, const CallInst *CI, MachinePointerInfo DstPtrInfo, const AAMDNodes &AAInfo = AAMDNodes()); diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 7fc18639e5852..128060ec912c7 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -532,7 +532,8 @@ static bool nextRealType(SmallVectorImpl &SubTypes, /// between it and the return. /// /// This function only tests target-independent requirements. -bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM) { +bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM, + bool ReturnsFirstArg) { const BasicBlock *ExitBB = Call.getParent(); const Instruction *Term = ExitBB->getTerminator(); const ReturnInst *Ret = dyn_cast(Term); @@ -575,7 +576,8 @@ bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM) { const Function *F = ExitBB->getParent(); return returnTypeIsEligibleForTailCall( - F, &Call, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering()); + F, &Call, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering(), + ReturnsFirstArg); } bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I, @@ -638,26 +640,11 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I, return CallerAttrs == CalleeAttrs; } -/// Check whether B is a bitcast of a pointer type to another pointer type, -/// which is equal to A. 
-static bool isPointerBitcastEqualTo(const Value *A, const Value *B) { - assert(A && B && "Expected non-null inputs!"); - - auto *BitCastIn = dyn_cast(B); - - if (!BitCastIn) - return false; - - if (!A->getType()->isPointerTy() || !B->getType()->isPointerTy()) - return false; - - return A == BitCastIn->getOperand(0); -} - bool llvm::returnTypeIsEligibleForTailCall(const Function *F, const Instruction *I, const ReturnInst *Ret, - const TargetLoweringBase &TLI) { + const TargetLoweringBase &TLI, + bool ReturnsFirstArg) { // If the block ends with a void return or unreachable, it doesn't matter // what the call's return type is. if (!Ret || Ret->getNumOperands() == 0) return true; @@ -671,26 +658,11 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, if (!attributesPermitTailCall(F, I, Ret, TLI, &AllowDifferingSizes)) return false; - const Value *RetVal = Ret->getOperand(0), *CallVal = I; - // Intrinsic like llvm.memcpy has no return value, but the expanded - // libcall may or may not have return value. On most platforms, it - // will be expanded as memcpy in libc, which returns the first - // argument. On other platforms like arm-none-eabi, memcpy may be - // expanded as library call without return value, like __aeabi_memcpy. - const CallInst *Call = cast(I); - if (Function *F = Call->getCalledFunction()) { - Intrinsic::ID IID = F->getIntrinsicID(); - if (((IID == Intrinsic::memcpy && - TLI.getLibcallName(RTLIB::MEMCPY) == StringRef("memcpy")) || - (IID == Intrinsic::memmove && - TLI.getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove")) || - (IID == Intrinsic::memset && - TLI.getLibcallName(RTLIB::MEMSET) == StringRef("memset"))) && - (RetVal == Call->getArgOperand(0) || - isPointerBitcastEqualTo(RetVal, Call->getArgOperand(0)))) - return true; - } + // If the return value is the first argument of the call. + if (ReturnsFirstArg) + return true; + const Value *RetVal = Ret->getOperand(0), *CallVal = I; SmallVector RetPath, CallPath; SmallVector RetSubTypes, CallSubTypes; @@ -739,6 +711,15 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, return true; } +bool llvm::funcReturnsFirstArgOfCall(const CallInst &CI) { + const ReturnInst *Ret = dyn_cast(CI.getParent()->getTerminator()); + Value *RetVal = Ret ? 
Ret->getReturnValue() : nullptr; + bool ReturnsFirstArg = false; + if (RetVal && ((RetVal == CI.getArgOperand(0)))) + ReturnsFirstArg = true; + return ReturnsFirstArg; +} + static void collectEHScopeMembers( DenseMap &EHScopeMembership, int EHScope, const MachineBasicBlock *MBB) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9bd0d1c51fbc2..94349ec97693f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -8236,12 +8237,11 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, } } -SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, - SDValue Src, SDValue Size, Align Alignment, - bool isVol, bool AlwaysInline, bool isTailCall, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo, - const AAMDNodes &AAInfo, AAResults *AA) { +SDValue SelectionDAG::getMemcpy( + SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, + Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, + std::optional OverrideTailCall, MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo, AAResults *AA) { // Check to see if we should lower the memcpy to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); @@ -8296,6 +8296,18 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); + bool IsTailCall = false; + if (OverrideTailCall.has_value()) { + IsTailCall = *OverrideTailCall; + } else { + bool LowersToMemcpy = + TLI->getLibcallName(RTLIB::MEMCPY) == StringRef("memcpy"); + bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); + IsTailCall = CI && CI->isTailCall() && + isInTailCallPosition(*CI, getTarget(), + ReturnsFirstArg && LowersToMemcpy); + } + CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), @@ -8304,7 +8316,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() - .setTailCall(isTailCall); + .setTailCall(IsTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; @@ -8352,7 +8364,8 @@ SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, - bool isVol, bool isTailCall, + bool isVol, const CallInst *CI, + std::optional OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo, AAResults *AA) { @@ -8398,6 +8411,19 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); + + bool IsTailCall = false; + if (OverrideTailCall.has_value()) { + IsTailCall = *OverrideTailCall; + } else { + bool LowersToMemmove = + TLI->getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove"); + bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); + IsTailCall = CI && CI->isTailCall() && + isInTailCallPosition(*CI, getTarget(), + ReturnsFirstArg && LowersToMemmove); + } + CLI.setDebugLoc(dl) 
.setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), @@ -8406,7 +8432,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() - .setTailCall(isTailCall); + .setTailCall(IsTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; @@ -8454,7 +8480,8 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, - bool isVol, bool AlwaysInline, bool isTailCall, + bool isVol, bool AlwaysInline, + const CallInst *CI, MachinePointerInfo DstPtrInfo, const AAMDNodes &AAInfo) { // Check to see if we should lower the memset to stores first. @@ -8514,8 +8541,9 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, return Entry; }; + bool UseBZero = isNullConstant(Src) && BzeroName; // If zeroing out and bzero is present, use it. - if (isNullConstant(Src) && BzeroName) { + if (UseBZero) { TargetLowering::ArgListTy Args; Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx))); Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); @@ -8533,8 +8561,16 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TLI->getPointerTy(DL)), std::move(Args)); } - - CLI.setDiscardResult().setTailCall(isTailCall); + bool LowersToMemset = + TLI->getLibcallName(RTLIB::MEMSET) == StringRef("memset"); + // If we're going to use bzero, make sure not to tail call unless the + // subsequent return doesn't need a value, as bzero doesn't return the first + // arg unlike memset. + bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI) && !UseBZero; + bool IsTailCall = + CI && CI->isTailCall() && + isInTailCallPosition(*CI, getTarget(), ReturnsFirstArg && LowersToMemset); + CLI.setDiscardResult().setTailCall(IsTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 51cbdd9b3ad31..d5cbb733a408d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6460,14 +6460,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Align SrcAlign = MCI.getSourceAlign().valueOrOne(); Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = MCI.isVolatile(); - bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memcpy DAG // node. SDValue Root = isVol ? 
getRoot() : getMemoryRoot(); - SDValue MC = DAG.getMemcpy( - Root, sdl, Op1, Op2, Op3, Alignment, isVol, - /* AlwaysInline */ false, isTC, MachinePointerInfo(I.getArgOperand(0)), - MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA); + SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Alignment, isVol, + /* AlwaysInline */ false, &I, std::nullopt, + MachinePointerInfo(I.getArgOperand(0)), + MachinePointerInfo(I.getArgOperand(1)), + I.getAAMetadata(), AA); updateDAGForMaybeTailCall(MC); return; } @@ -6482,13 +6482,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Align SrcAlign = MCI.getSourceAlign().valueOrOne(); Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = MCI.isVolatile(); - bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memcpy DAG // node. - SDValue MC = DAG.getMemcpy( - getRoot(), sdl, Dst, Src, Size, Alignment, isVol, - /* AlwaysInline */ true, isTC, MachinePointerInfo(I.getArgOperand(0)), - MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA); + SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Alignment, isVol, + /* AlwaysInline */ true, &I, std::nullopt, + MachinePointerInfo(I.getArgOperand(0)), + MachinePointerInfo(I.getArgOperand(1)), + I.getAAMetadata(), AA); updateDAGForMaybeTailCall(MC); return; } @@ -6500,11 +6500,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // @llvm.memset defines 0 and 1 to both mean no alignment. Align Alignment = MSI.getDestAlign().valueOrOne(); bool isVol = MSI.isVolatile(); - bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); SDValue Root = isVol ? getRoot() : getMemoryRoot(); SDValue MS = DAG.getMemset( Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false, - isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata()); + &I, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata()); updateDAGForMaybeTailCall(MS); return; } @@ -6517,10 +6516,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // @llvm.memset defines 0 and 1 to both mean no alignment. Align DstAlign = MSII.getDestAlign().valueOrOne(); bool isVol = MSII.isVolatile(); - bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); SDValue Root = isVol ? getRoot() : getMemoryRoot(); SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol, - /* AlwaysInline */ true, isTC, + /* AlwaysInline */ true, &I, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata()); updateDAGForMaybeTailCall(MC); @@ -6536,12 +6534,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Align SrcAlign = MMI.getSourceAlign().valueOrOne(); Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = MMI.isVolatile(); - bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memmove DAG // node. SDValue Root = isVol ? 
getRoot() : getMemoryRoot(); - SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Alignment, isVol, - isTC, MachinePointerInfo(I.getArgOperand(0)), + SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Alignment, isVol, &I, + /* OverrideTailCall */ std::nullopt, + MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA); updateDAGForMaybeTailCall(MM); @@ -9039,11 +9037,10 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { // because the return pointer needs to be adjusted by the size of // the copied memory. SDValue Root = getMemoryRoot(); - SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Alignment, false, false, - /*isTailCall=*/false, - MachinePointerInfo(I.getArgOperand(0)), - MachinePointerInfo(I.getArgOperand(1)), - I.getAAMetadata()); + SDValue MC = DAG.getMemcpy( + Root, sdl, Dst, Src, Size, Alignment, false, false, /*CI=*/nullptr, + std::nullopt, MachinePointerInfo(I.getArgOperand(0)), + MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata()); assert(MC.getNode() != nullptr && "** memcpy should not be lowered as TailCall in mempcpy context **"); DAG.setRoot(MC); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index df9b0ae1a632f..eef83a845e2c3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8578,7 +8578,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getNonZeroByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, - /*isTailCall = */ false, DstInfo, MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { @@ -10878,8 +10878,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(VaListSize, DL, MVT::i32), - Align(PtrSize), false, false, false, - MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); + Align(PtrSize), false, false, /*CI=*/nullptr, + std::nullopt, MachinePointerInfo(DestSV), + MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bb8e21772e566..a79d8f7bd1b5e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3822,7 +3822,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getNonZeroByValAlign(), /*isVol = */ false, /*AlwaysInline = */ true, - /*isTailCall = */ false, DstInfo, + /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); MemOpChains.push_back(Cpy); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 79cffc0da7a4f..7aeaebc584c64 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -178,7 +178,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, return DAG.getMemcpy( Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), /*isVolatile=*/false, /*AlwaysInline=*/false, - /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); } bool @@ -1038,10 +1038,10 @@ 
HexagonTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // Size of the va_list is 12 bytes as it has 3 pointers. Therefore, // we need to memcopy 12 bytes from va_list to another similar list. - return DAG.getMemcpy(Chain, DL, DestPtr, SrcPtr, - DAG.getIntPtrConstant(12, DL), Align(4), - /*isVolatile*/ false, false, false, - MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); + return DAG.getMemcpy( + Chain, DL, DestPtr, SrcPtr, DAG.getIntPtrConstant(12, DL), Align(4), + /*isVolatile*/ false, false, /*CI=*/nullptr, std::nullopt, + MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index 06fd7ac807d8a..f6763a35cc0d5 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -648,7 +648,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, /*IsVolatile=*/false, /*AlwaysInline=*/false, - /*isTailCall=*/false, MachinePointerInfo(), + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); ByValArgs.push_back(FIPtr); } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 79da36c03e304..ba6be85c7f2e8 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -4225,7 +4225,7 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, /*IsVolatile=*/false, - /*AlwaysInline=*/false, /*isTailCall=*/IsTailCall, + /*AlwaysInline=*/false, /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); ByValArgs.push_back(FIPtr); } diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index fc066f001316d..ba7b6c85bd81a 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -864,11 +864,12 @@ SDValue MSP430TargetLowering::LowerCCCCallTo( if (Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i16); - MemOp = DAG.getMemcpy( - Chain, dl, PtrOff, Arg, SizeNode, Flags.getNonZeroByValAlign(), - /*isVolatile*/ false, - /*AlwaysInline=*/true, - /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo()); + MemOp = DAG.getMemcpy(Chain, dl, PtrOff, Arg, SizeNode, + Flags.getNonZeroByValAlign(), + /*isVolatile*/ false, + /*AlwaysInline=*/true, + /*CI=*/nullptr, std::nullopt, + MachinePointerInfo(), MachinePointerInfo()); } else { MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index ef70ef2772681..0f2047fcac640 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4506,7 +4506,7 @@ void MipsTargetLowering::passByValArg( Chain = DAG.getMemcpy( Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, DL, PtrTy), Align(Alignment), /*isVolatile=*/false, /*AlwaysInline=*/false, - /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); MemOpChains.push_back(Chain); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 411114599543c..a11ab93b8db3c 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3904,8 +3904,8 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8), - false, true, false, MachinePointerInfo(), - MachinePointerInfo()); + false, true, /*CI=*/nullptr, std::nullopt, + MachinePointerInfo(), MachinePointerInfo()); } SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, @@ -5275,9 +5275,9 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); - return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, - Flags.getNonZeroByValAlign(), false, false, false, - MachinePointerInfo(), MachinePointerInfo()); + return DAG.getMemcpy( + Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), false, false, + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); } /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 953196a586b6e..fef1441eca9c6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19932,7 +19932,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, /*IsVolatile=*/false, - /*AlwaysInline=*/false, IsTailCall, + /*AlwaysInline=*/false, /*CI*/ nullptr, IsTailCall, MachinePointerInfo(), MachinePointerInfo()); ByValArgs.push_back(FIPtr); } diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 0dba6c47be030..50aa19446f880 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -868,8 +868,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Alignment, false, // isVolatile, (Size <= 32), // AlwaysInline if size <= 32, - false, // isTailCall - MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), + MachinePointerInfo()); ByValArgs.push_back(FIPtr); } else { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 0a7229a2bc0fb..b2b88143354a5 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3951,7 +3951,7 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, Subtarget.isTargetXPLINK64() ? 
getTargetMachine().getPointerSize(0) : 32; return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(Sz, DL), Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false, - /*isTailCall*/ false, MachinePointerInfo(DstSV), + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index a793e59c3d1a7..f77076d7244ca 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1123,10 +1123,11 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue SizeNode = DAG.getConstant(Out.Flags.getByValSize(), DL, MVT::i32); SDValue FINode = DAG.getFrameIndex(FI, getPointerTy(Layout)); - Chain = DAG.getMemcpy( - Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getNonZeroByValAlign(), - /*isVolatile*/ false, /*AlwaysInline=*/false, - /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); + Chain = DAG.getMemcpy(Chain, DL, FINode, OutVal, SizeNode, + Out.Flags.getNonZeroByValAlign(), + /*isVolatile*/ false, /*AlwaysInline=*/false, + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), + MachinePointerInfo()); OutVal = FINode; } // Count the number of fixed args *after* legalization. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9d651d4db6731..bcdd31c22d314 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25147,7 +25147,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL), Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false, - false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV), + MachinePointerInfo(SrcSV)); } // Helper to get immediate/variable SSE shift opcode from other shift opcodes. diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index ed3fb13b2b232..3c49b6ea8ec7b 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -1240,7 +1240,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, return DAG.getMemcpy( Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), /*isVolatile*/ false, /*AlwaysInline=*/true, - /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. 
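(The hunks above and below all make the same mechanical change: the old `/*isTailCall*/ false` boolean passed to `DAG.getMemcpy`/`DAG.getMemmove` is replaced by a `CallInst` pointer plus an `std::optional` tail-call override, written `/*CI=*/nullptr, std::nullopt` at most call sites. The following is a minimal self-contained C++ sketch of that interface pattern only; every name here is invented for illustration, and only the shape of the change mirrors the hunks.)

    #include <cstdio>
    #include <optional>

    struct CallSite { bool inTailPosition; };

    // Old style: the caller passes a precomputed flag.
    void emitMemcpyOld(bool isTailCall) {
      std::printf("tail call: %d\n", isTailCall);
    }

    // New style: the emitter derives the flag from the originating call,
    // unless the caller overrides it explicitly via the optional.
    void emitMemcpyNew(const CallSite *CI, std::optional<bool> overrideTailCall) {
      bool isTailCall = overrideTailCall.value_or(CI && CI->inTailPosition);
      std::printf("tail call: %d\n", isTailCall);
    }

    int main() {
      emitMemcpyOld(false);
      CallSite CS{true};
      emitMemcpyNew(&CS, std::nullopt); // decision derived from the call site
      emitMemcpyNew(nullptr, false);    // explicit override, like most hunks above
      return 0;
    }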
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index e5f07f230fe6c..055466ac660cc 100644 --- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -147,7 +147,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( DAG.getConstant(Offset, dl, AddrVT)), Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile, AlwaysInline, - /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset))); + /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset))); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); } @@ -255,7 +255,7 @@ static SDValue emitConstantSizeRepmov( DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)), DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile, - /*AlwaysInline*/ true, /*isTailCall*/ false, + /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt, DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); } diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp index c81da0365af3d..f4d32833bce99 100644 --- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -1301,8 +1301,8 @@ SDValue XCoreTargetLowering::LowerCCCArguments( InVals.push_back(FIN); MemOps.push_back(DAG.getMemcpy( Chain, dl, FIN, ArgDI.SDV, DAG.getConstant(Size, dl, MVT::i32), - Alignment, false, false, false, MachinePointerInfo(), - MachinePointerInfo())); + Alignment, false, false, /*CI=*/nullptr, std::nullopt, + MachinePointerInfo(), MachinePointerInfo())); } else { InVals.push_back(ArgDI.SDV); } @@ -1704,7 +1704,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, bool isTail = isInTailCallPosition(DAG, ST, Chain); return DAG.getMemmove(Chain, dl, ST->getBasePtr(), LD->getBasePtr(), DAG.getConstant(StoreBits / 8, dl, MVT::i32), - Alignment, false, isTail, + Alignment, false, nullptr, isTail, ST->getPointerInfo(), LD->getPointerInfo()); } } diff --git a/llvm/test/CodeGen/AArch64/no-tail-call-bzero-from-memset.ll b/llvm/test/CodeGen/AArch64/no-tail-call-bzero-from-memset.ll new file mode 100644 index 0000000000000..34c6c63cc1798 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/no-tail-call-bzero-from-memset.ll @@ -0,0 +1,20 @@ +; RUN: llc -o - %s | FileCheck %s +; RUN: llc -global-isel -o - %s | FileCheck %s +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx15.0.0" + +define ptr @test() { +; CHECK-LABEL: test: +; CHECK: bl _bzero + %1 = tail call ptr @fn(i32 noundef 1) #3 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(1000) %1, i8 noundef 0, i64 noundef 1000, i1 noundef false) #3 + ret ptr %1 +} + +declare ptr @fn(i32 noundef) + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #3 = { nounwind optsize } From a751f653b40f2021f091a2f1ebcc2d91bc4cc89d Mon Sep 17 00:00:00 2001 From: dlav-sc Date: Wed, 17 Jul 2024 11:36:19 +0300 Subject: [PATCH 234/777] [lldb][RISCV] function prologue backtrace fix (#99043) CreateFunctionEntryUnwindPlan RISCV ABI function fix needed to receive a valid backtrace at the start of functions. 
Fixed tests for RISCV target: TestNumThreads.NumberOfThreadsTestCase TestCPPExceptionBreakpoints.CPPBreakpointTestCase TestStepThroughTrampoline.StepThroughTrampoline TestOSPluginStepping.TestOSPluginStepping TestSteppingOutWithArtificialFrames.TestArtificialFrameThreadStepOut1 TestStepAvoidsRegexp.StepAvoidsRegexTestCase TestInlineStepping.TestInlineStepping TestStepOverBreakpoint.StepOverBreakpointsTestCase TestStepOverBreakpoint.StepOverBreakpointsTestCase TestSteppingOutWithArtificialFrames.TestArtificialFrameThreadStepOut1 TestTailCallFrameSBAPI.TestTailCallFrameSBAPI TestThreadPlanUserBreakpoint.ThreadPlanUserBreakpointsTestCase --- lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp index 6395f5bb5bd9b..35d4f0521bf1f 100644 --- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp +++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/DerivedTypes.h" +#include "Utility/RISCV_DWARF_Registers.h" #include "lldb/Core/PluginManager.h" #include "lldb/Core/Value.h" #include "lldb/Core/ValueObjectConstResult.h" @@ -643,9 +644,9 @@ bool ABISysV_riscv::CreateFunctionEntryUnwindPlan(UnwindPlan &unwind_plan) { unwind_plan.Clear(); unwind_plan.SetRegisterKind(eRegisterKindDWARF); - uint32_t pc_reg_num = LLDB_REGNUM_GENERIC_PC; - uint32_t sp_reg_num = LLDB_REGNUM_GENERIC_SP; - uint32_t ra_reg_num = LLDB_REGNUM_GENERIC_RA; + uint32_t pc_reg_num = riscv_dwarf::dwarf_gpr_pc; + uint32_t sp_reg_num = riscv_dwarf::dwarf_gpr_sp; + uint32_t ra_reg_num = riscv_dwarf::dwarf_gpr_ra; UnwindPlan::RowSP row(new UnwindPlan::Row); From 8bf952d77fbe63f979e4293e95a5ca379e26eede Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 17 Jul 2024 08:51:19 +0000 Subject: [PATCH 235/777] [flang][test] Fix mtune test on AArch64 bots The native architecture is AArch64 here so the pentium name won't work even if you've got the x86 backend enabled. https://lab.llvm.org/buildbot/#/builders/17/builds/898 Pass an explicit target for each run line to fix this. Test added in f1d3fe7aae7867b5de96b84d6d26b5c9f02f209a / #98517 --- flang/test/Lower/tune-cpu-llvm.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/tune-cpu-llvm.f90 b/flang/test/Lower/tune-cpu-llvm.f90 index dc2a68730cf23..6cf7d91ad76b4 100644 --- a/flang/test/Lower/tune-cpu-llvm.f90 +++ b/flang/test/Lower/tune-cpu-llvm.f90 @@ -1,5 +1,5 @@ -! RUN: %if x86-registered-target %{ %flang -mtune=pentium4 -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,CHECK-X86 %} -! RUN: %if aarch64-registered-target %{ %flang -mtune=neoverse-n1 -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,CHECK-ARM %} +! RUN: %if x86-registered-target %{ %flang -target x86_64-linux-gnu -mtune=pentium4 -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,CHECK-X86 %} +! 
RUN: %if aarch64-registered-target %{ %flang -target aarch64-linux-gnu -mtune=neoverse-n1 -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,CHECK-ARM %} !ALL: attributes #{{[0-9]+}} = { !CHECK-X86-SAME: "tune-cpu"="pentium4" From 72b3d7bc87019ba7ef268ce322f90382f01b11af Mon Sep 17 00:00:00 2001 From: Timm Bäder Date: Wed, 17 Jul 2024 10:53:14 +0200 Subject: [PATCH 236/777] [clang][Interp] Make sure we don't overflow Descriptor::AllocSize We allocate the metadata and the array elements in one allocation, and we save its size in a field of type 'unsigned'. Make sure the full size of the allocation doesn't overflow the field. --- clang/lib/AST/Interp/Descriptor.cpp | 1 + clang/lib/AST/Interp/Descriptor.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp index a3801a01688c8..f7d1201f625bb 100644 --- a/clang/lib/AST/Interp/Descriptor.cpp +++ b/clang/lib/AST/Interp/Descriptor.cpp @@ -303,6 +303,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD, IsArray(true), CtorFn(getCtorArrayPrim(Type)), DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) { assert(Source && "Missing source"); + assert(NumElems <= (MaxArrayElemBytes / ElemSize)); } /// Primitive unknown-size arrays. diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index f444b8a78e802..0dd97812e5a5c 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_AST_INTERP_DESCRIPTOR_H #define LLVM_CLANG_AST_INTERP_DESCRIPTOR_H +#include "PrimType.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" @@ -125,6 +126,11 @@ struct Descriptor final { static constexpr MetadataSize InlineDescMD = sizeof(InlineDescriptor); static constexpr MetadataSize GlobalMD = sizeof(GlobalInlineDescriptor); + /// Maximum number of bytes to be used for array elements. + static constexpr unsigned MaxArrayElemBytes = + std::numeric_limits<unsigned>::max() - sizeof(InitMapPtr) - + align(std::max(*InlineDescMD, *GlobalMD)); + /// Pointer to the record, if block contains records. const Record *const ElemRecord = nullptr; /// Descriptor of the array element. From 7c597c0e691bb685a7b9ef01e3ccaad5e64a3e92 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 17 Jul 2024 10:12:02 +0100 Subject: [PATCH 237/777] [Utils][vim] Match vector 'splat' keyword (#99004) --- llvm/utils/vim/syntax/llvm.vim | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index a3eb010989ef6..2a294223269ba 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -177,6 +177,7 @@ syn keyword llvmKeyword \ speculative_load_hardening \ spir_func \ spir_kernel + \ splat \ sret \ ssp \ sspreq From 2e56497bf7b2c848b2c43ce8c64e585bc006240a Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 17 Jul 2024 10:12:46 +0100 Subject: [PATCH 238/777] [Utils][vim] Match more hexadecimal float constants (#99000) The `0x[KLMHR]` syntax denotes different floating-point types: various long doubles, half and bfloat. See https://llvm.org/docs/LangRef.html#simple-constants for reference.
--- llvm/utils/vim/syntax/llvm.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index 2a294223269ba..fac509c355cb8 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -219,7 +219,7 @@ syn keyword llvmError getresult begin end syn match llvmNoName /[%@!]\d\+\>/ syn match llvmNumber /-\?\<\d\+\>/ syn match llvmFloat /-\?\<\d\+\.\d*\(e[+-]\d\+\)\?\>/ -syn match llvmFloat /\<0x\x\+\>/ +syn match llvmFloat /\<0x[KLMHR]\?\x\+\>/ syn keyword llvmBoolean true false syn keyword llvmConstant zeroinitializer undef null none poison vscale syn match llvmComment /;.*$/ From c7309dadbf5a07353fa18a712895e3cfb48a78e7 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 17 Jul 2024 10:18:03 +0100 Subject: [PATCH 239/777] [AMDGPU] Use range-based for loops. NFC. (#99047) --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 8 ++------ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 10 ++++------ .../Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 13 ++++++------- .../Target/AMDGPU/R600EmitClauseMarkers.cpp | 18 ++++++++---------- .../AMDGPU/R600MachineCFGStructurizer.cpp | 4 ++-- .../AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 5 ++--- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 14 +++++--------- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 8 +++----- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +++---- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 6 +++--- 11 files changed, 40 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 8d74689b5ad7b..1ddf6686b97e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -190,13 +190,9 @@ class SchedGroup { // Returns true if the SU matches all rules bool allowedByRules(const SUnit *SU, SmallVectorImpl &SyncPipe) const { - if (Rules.empty()) - return true; - for (size_t I = 0; I < Rules.size(); I++) { - auto TheRule = Rules[I].get(); - if (!TheRule->apply(SU, Collection, SyncPipe)) { + for (auto &Rule : Rules) { + if (!Rule.get()->apply(SU, Collection, SyncPipe)) return false; - } } return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index d4b87d85a7c20..39ae7c96cf772 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1342,8 +1342,8 @@ SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, DAG.getContext()->diagnose(NoCalls); if (!CLI.IsTailCall) { - for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) - InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + for (ISD::InputArg &Arg : CLI.Ins) + InVals.push_back(DAG.getUNDEF(Arg.VT)); } return DAG.getEntryNode(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 30c5e5eebfcdc..e01c9dc66a3f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -861,9 +861,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { Constant *nval; if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector FVal; - for (unsigned i = 0; i < DVal.size(); ++i) { - FVal.push_back((float)DVal[i]); - } + for (double D : DVal) + FVal.push_back((float)D); ArrayRef tmp(FVal); nval = ConstantDataVector::get(context, tmp); } else { // F64 @@ -1082,9 +1081,8 @@ bool 
AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, } if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector FVal; - for (unsigned i=0; i < DVal.size(); ++i) { - FVal.push_back((float)DVal[i]); - } + for (double D : DVal) + FVal.push_back((float)D); ArrayRef tmp(FVal); cnval = ConstantDataVector::get(M->getContext(), tmp); } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a295117de6414..bb2603e0076e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -321,8 +321,7 @@ class AMDGPULowerModuleLDS { ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size()); SmallVector Elements; - for (size_t i = 0; i < Variables.size(); i++) { - GlobalVariable *GV = Variables[i]; + for (GlobalVariable *GV : Variables) { auto ConstantGepIt = LDSVarsToConstantGEP.find(GV); if (ConstantGepIt != LDSVarsToConstantGEP.end()) { auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32); @@ -1194,10 +1193,10 @@ class AMDGPULowerModuleLDS { IsPaddingField.reserve(LDSVarsToTransform.size()); { uint64_t CurrentOffset = 0; - for (size_t I = 0; I < LayoutFields.size(); I++) { - GlobalVariable *FGV = static_cast( - const_cast(LayoutFields[I].Id)); - Align DataAlign = LayoutFields[I].Alignment; + for (auto &F : LayoutFields) { + GlobalVariable *FGV = + static_cast(const_cast(F.Id)); + Align DataAlign = F.Alignment; uint64_t DataAlignV = DataAlign.value(); if (uint64_t Rem = CurrentOffset % DataAlignV) { @@ -1218,7 +1217,7 @@ class AMDGPULowerModuleLDS { LocalVars.push_back(FGV); IsPaddingField.push_back(false); - CurrentOffset += LayoutFields[I].Size; + CurrentOffset += F.Size; } } diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index ccbfa4fde09a0..de3c06f3a71e2 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -124,10 +124,9 @@ class R600EmitClauseMarkers : public MachineFunctionPass { assert( (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) && "Can't assign Const"); - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != R600::ALU_CONST) + for (auto &[Op, Sel] : Consts) { + if (Op->getReg() != R600::ALU_CONST) continue; - unsigned Sel = Consts[i].second; unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; unsigned KCacheIndex = Index * 4 + Chan; const std::pair &BankLine = getAccessedBankLine(Sel); @@ -155,17 +154,16 @@ class R600EmitClauseMarkers : public MachineFunctionPass { if (!UpdateInstr) return true; - for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != R600::ALU_CONST) + unsigned j = 0; + for (auto &[Op, Sel] : Consts) { + if (Op->getReg() != R600::ALU_CONST) continue; - switch(UsedKCache[j].first) { + switch (UsedKCache[j].first) { case 0: - Consts[i].first->setReg( - R600::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + Op->setReg(R600::R600_KC0RegClass.getRegister(UsedKCache[j].second)); break; case 1: - Consts[i].first->setReg( - R600::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + Op->setReg(R600::R600_KC1RegClass.getRegister(UsedKCache[j].second)); break; default: llvm_unreachable("Wrong Cache Line"); diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index 3aa8dd8c52162..abcccc492c671 100644 --- 
a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -669,8 +669,8 @@ void R600MachineCFGStructurizer::wrapup(MachineBasicBlock *MBB) { } //delete continue right before endloop - for (unsigned i = 0; i < ContInstr.size(); ++i) - ContInstr[i]->eraseFromParent(); + for (auto *MI : ContInstr) + MI->eraseFromParent(); // TODO to fix up jump table so later phase won't be confused. if // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index c1a5e3b593748..604a4cb1bf881 100644 --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -244,9 +244,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { Modified |= replaceSamplerUses(Arg, ResourceID); } } - for (unsigned i = 0; i < InstsToErase.size(); ++i) { - InstsToErase[i]->eraseFromParent(); - } + for (auto *Inst : InstsToErase) + Inst->eraseFromParent(); return Modified; } diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index d43100254bfc9..3491558a3e8e7 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -613,10 +613,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); MDT = &getAnalysis().getDomTree(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock *MBB = &*BI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { MachineInstr &MI = *I; @@ -665,7 +663,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Register NewDst = MRI->createVirtualRegister(DestRC); MachineBasicBlock *BlockToInsertCopy = MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB() - : MBB; + : &MBB; MachineBasicBlock::iterator PointToInsertCopy = MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; @@ -1095,10 +1093,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { bool IsWave32 = MF.getSubtarget().isWave32(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { - MachineBasicBlock *MBB = &*BI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { MachineInstr &MI = *I; // May already have been lowered. 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 7bf6a635158eb..0e8c96625b221 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1770,8 +1770,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER)) return false; - for (auto &Def : Defs) { - const auto *Op = Def.first; + for (auto &[Op, SubIdx] : Defs) { if (!Op->isReg()) return false; if (TRI->isAGPR(*MRI, Op->getReg())) @@ -1809,8 +1808,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), Dst); - for (unsigned I = 0; I < Defs.size(); ++I) { - MachineOperand *Def = Defs[I].first; + for (auto &[Def, SubIdx] : Defs) { Def->setIsKill(false); if (TRI->isAGPR(*MRI, Def->getReg())) { RS.add(*Def); @@ -1819,7 +1817,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { SubDef->getOperand(1).setIsKill(false); RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg()); } - RS.addImm(Defs[I].second); + RS.addImm(SubIdx); } Op->setReg(Dst); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a79d8f7bd1b5e..df5a334f83082 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3250,8 +3250,7 @@ SDValue SITargetLowering::LowerCallResult( CCInfo.AnalyzeCallResult(Ins, RetCC); // Copy all of the result registers out of their specified physreg. - for (unsigned i = 0; i != RVLocs.size(); ++i) { - CCValAssign VA = RVLocs[i]; + for (CCValAssign VA : RVLocs) { SDValue Val; if (VA.isRegLoc()) { @@ -3642,8 +3641,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (Callee.isUndef() || isNullConstant(Callee)) { if (!CLI.IsTailCall) { - for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) - InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + for (ISD::InputArg &Arg : CLI.Ins) + InVals.push_back(DAG.getUNDEF(Arg.VT)); } return Chain; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 5e89c286bfbbd..2186c1ede468c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -324,8 +324,7 @@ void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( MachineFunction &MF) { const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) { - Register Reg = SpillPhysVGPRs[I]; + for (Register &Reg : SpillPhysVGPRs) { Register NewReg = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (!NewReg || NewReg >= Reg) @@ -334,7 +333,6 @@ void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( MRI.replaceRegWith(Reg, NewReg); // Update various tables with the new VGPR. - SpillPhysVGPRs[I] = NewReg; WWMReservedRegs.remove(Reg); WWMReservedRegs.insert(NewReg); WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg])); @@ -344,6 +342,8 @@ void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( MBB.removeLiveIn(Reg); MBB.sortUniqueLiveIns(); } + + Reg = NewReg; } } From de61875e9d276cdb1899c2bea818bca86025b35a Mon Sep 17 00:00:00 2001 From: Hugo Trachino Date: Wed, 17 Jul 2024 10:22:25 +0100 Subject: [PATCH 240/777] [MLIR][Vector] Generalize DropUnitDimFromElementwiseOps to non leading / trailing dimensions. 
(#98455) Generalizes DropUnitDimFromElementwiseOps to support inner unit dimensions. This change stems from improving lowering of contractionOps for Arm SME. Where we end up with inner unit dimensions on MulOp, BroadcastOp and TransposeOp, preventing the generation of outerproducts. discussed [here](https://discourse.llvm.org/t/on-improving-arm-sme-lowering-resilience-in-mlir/78543/17?u=nujaa). Fix after : https://github.com/llvm/llvm-project/pull/97652 showed an unhandled edge case when all dimensions are one. The generated target VectorType would be `vector` which is apparently not supported by the mulf. In case all dimensions are dropped, the target vectorType is vector<1xf32> --------- Co-authored-by: Benjamin Maxwell --- .../Vector/Transforms/VectorTransforms.cpp | 57 +++++++++++-------- .../Vector/vector-transfer-flatten.mlir | 51 +++++++++++++++++ 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index b69e9421384b0..2686277bba59d 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1627,7 +1627,33 @@ struct ChainedReduction final : OpRewritePattern { } }; -/// For vectors with either leading or trailing unit dim, replaces: +// Helper function dropping unit non-scalable dimension from a VectorType +// keeping at least 1 dimension to avoid generating 0-D vectors. Scalable unit +// dimensions are not dropped. Folding such dimensions would require "shifting" +// the scalable flag onto some other fixed-width dim (e.g. vector<[1]x4xf32> -> +// vector<[4]xf32>). This could be implemented in the future. +static VectorType dropNonScalableUnitDimFromType(VectorType inVecTy) { + auto inVecShape = inVecTy.getShape(); + SmallVector newShape; + SmallVector newScalableDims; + for (auto [dim, isScalable] : + llvm::zip_equal(inVecShape, inVecTy.getScalableDims())) { + if (dim == 1 && !isScalable) + continue; + + newShape.push_back(dim); + newScalableDims.push_back(isScalable); + } + // All dims have been dropped, return vector<1xeType>. + if (newShape.empty()) { + newShape.push_back(1); + newScalableDims.push_back(false); + } + + return VectorType::get(newShape, inVecTy.getElementType(), newScalableDims); +} + +/// For vectors with at least one unit dim, replaces: /// elementwise(a, b) /// with: /// sc_a = shape_cast(a) @@ -1639,20 +1665,16 @@ struct ChainedReduction final : OpRewritePattern { /// required to be rank > 1. /// /// Ex: -/// ``` /// %mul = arith.mulf %B_row, %A_row : vector<1x[4]xf32> /// %cast = vector.shape_cast %mul : vector<1x[4]xf32> to vector<[4]xf32> -/// ``` /// /// gets converted to: /// -/// ``` /// %B_row_sc = vector.shape_cast %B_row : vector<1x[4]xf32> to vector<[4]xf32> /// %A_row_sc = vector.shape_cast %A_row : vector<1x[4]xf32> to vector<[4]xf32> /// %mul = arith.mulf %B_row_sc, %A_row_sc : vector<[4]xf32> /// %cast_new = vector.shape_cast %mul : vector<[4]xf32> to vector<1x[4]xf32> /// %cast = vector.shape_cast %cast_new : vector<1x[4]xf32> to vector<[4]xf32> -/// ``` /// /// Patterns for folding shape_casts should instantly eliminate `%cast_new` and /// `%cast`. 
@@ -1677,37 +1699,26 @@ struct DropUnitDimFromElementwiseOps final if (sourceVectorType.getRank() < 2) return failure(); - bool hasTrailingDimUnitFixed = - ((sourceVectorType.getShape().back() == 1) && - (!sourceVectorType.getScalableDims().back())); - bool hasLeadingDimUnitFixed = - ((sourceVectorType.getShape().front() == 1) && - (!sourceVectorType.getScalableDims().front())); - if (!hasLeadingDimUnitFixed && !hasTrailingDimUnitFixed) - return failure(); - - // Drop leading/trailing unit dim by applying vector.shape_cast to all - // operands - int64_t dim = hasLeadingDimUnitFixed ? 0 : sourceVectorType.getRank() - 1; - SmallVector newOperands; auto loc = op->getLoc(); for (auto operand : op->getOperands()) { auto opVectorType = cast(operand.getType()); - VectorType newVType = VectorType::Builder(opVectorType).dropDim(dim); + auto newVType = dropNonScalableUnitDimFromType(opVectorType); + if (newVType == opVectorType) + return rewriter.notifyMatchFailure(op, "No unit dimension to remove."); + auto opSC = rewriter.create(loc, newVType, operand); newOperands.push_back(opSC); } VectorType newResultVectorType = - VectorType::Builder(resultVectorType).dropDim(dim); - // Create an updated elementwise Op without leading/trailing unit dim + dropNonScalableUnitDimFromType(resultVectorType); + // Create an updated elementwise Op without unit dim. Operation *elementwiseOp = rewriter.create(loc, op->getName().getIdentifier(), newOperands, newResultVectorType, op->getAttrs()); - // Restore the leading/trailing unit dim by applying vector.shape_cast - // to the result + // Restore the unit dim by applying vector.shape_cast to the result. rewriter.replaceOpWithNewOp(op, resultVectorType, elementwiseOp->getResult(0)); diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 5fd3cbd54aa58..303f841e8a828 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -604,6 +604,57 @@ func.func @fold_unit_dims_entirely(%arg0 : vector<8xi32>, // ----- +func.func @fold_inner_unit_dim(%arg0 : vector<8x1x3xf128>, + %arg1 : vector<1x8x3xf128>) -> vector<8x3xf128> { + %sc_arg1 = vector.shape_cast %arg1 : vector<1x8x3xf128> to vector<8x1x3xf128> + %mul = arith.mulf %arg0, %sc_arg1 : vector<8x1x3xf128> + %res = vector.shape_cast %mul : vector<8x1x3xf128> to vector<8x3xf128> + return %res : vector<8x3xf128> +} + +// CHECK-LABEL: func.func @fold_inner_unit_dim( +// CHECK-SAME: %[[VAL_0:.*]]: vector<8x1x3xf128>, +// CHECK-SAME: %[[VAL_1:.*]]: vector<1x8x3xf128>) -> vector<8x3xf128> { +// CHECK: %[[VAL_2:.*]] = vector.shape_cast %[[VAL_0]] : vector<8x1x3xf128> to vector<8x3xf128> +// CHECK: %[[VAL_3:.*]] = vector.shape_cast %[[VAL_1]] : vector<1x8x3xf128> to vector<8x3xf128> +// CHECK: %[[VAL_4:.*]] = arith.mulf %[[VAL_2]], %[[VAL_3]] : vector<8x3xf128> +// CHECK: return %[[VAL_4]] : vector<8x3xf128> + +// ----- + +func.func @fold_inner_unit_dim_scalable(%arg0 : vector<8x1x[1]x3xf128>, + %arg1 : vector<1x8x[1]x3xf128>) -> vector<8x[1]x3xf128> { + %sc_arg1 = vector.shape_cast %arg1 : vector<1x8x[1]x3xf128> to vector<8x1x[1]x3xf128> + %mul = arith.mulf %arg0, %sc_arg1 : vector<8x1x[1]x3xf128> + %res = vector.shape_cast %mul : vector<8x1x[1]x3xf128> to vector<8x[1]x3xf128> + return %res : vector<8x[1]x3xf128> +} + +// CHECK-LABEL: func.func @fold_inner_unit_dim_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<8x1x[1]x3xf128>, +// CHECK-SAME: %[[VAL_1:.*]]: vector<1x8x[1]x3xf128>) -> 
vector<8x[1]x3xf128> { // CHECK: %[[VAL_2:.*]] = vector.shape_cast %[[VAL_0]] : vector<8x1x[1]x3xf128> to vector<8x[1]x3xf128> // CHECK: %[[VAL_3:.*]] = vector.shape_cast %[[VAL_1]] : vector<1x8x[1]x3xf128> to vector<8x[1]x3xf128> // CHECK: %[[VAL_4:.*]] = arith.mulf %[[VAL_2]], %[[VAL_3]] : vector<8x[1]x3xf128> // CHECK: return %[[VAL_4]] : vector<8x[1]x3xf128> // ----- func.func @fold_all_unit_dims(%arg0: vector<1x1xf32>) -> vector<1xf32> { %0 = arith.mulf %arg0, %arg0 : vector<1x1xf32> %res = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32> return %res : vector<1xf32> } // CHECK-LABEL: func.func @fold_all_unit_dims( // CHECK-SAME: %[[VAL_0:.*]]: vector<1x1xf32>) -> vector<1xf32> // CHECK: %[[VAL_1:.*]] = vector.shape_cast %[[VAL_0]] : vector<1x1xf32> to vector<1xf32> // CHECK: %[[VAL_2:.*]] = vector.shape_cast %[[VAL_0]] : vector<1x1xf32> to vector<1xf32> // CHECK: %[[VAL_3:.*]] = arith.mulf %[[VAL_1]], %[[VAL_2]] : vector<1xf32> // CHECK: return %[[VAL_3]] : vector<1xf32> // ----- func.func @negative_out_of_bound_transfer_read( %arg : memref>) -> vector<5x4x3x2xi8> { %c0 = arith.constant 0 : index From 3941f652317d95cac203e64791bfa730de7bbd1e Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Wed, 17 Jul 2024 11:25:47 +0200 Subject: [PATCH 241/777] Adjust the M68k backend after change f270a4dd6667759d7305797a077ae09648318ac7 --- llvm/lib/Target/M68k/M68kISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index 62e4b36b5c9a8..316a6eebc2db0 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -268,7 +268,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, return DAG.getMemcpy( Chain, DL, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), /*isVolatile=*/false, /*AlwaysInline=*/true, - /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. From aa21ee7926a265c705b00bae186cf8adf0ca7410 Mon Sep 17 00:00:00 2001 From: Finlay Date: Wed, 17 Jul 2024 10:26:25 +0100 Subject: [PATCH 242/777] [MLIR] Add attributes no_unwind and will_return to the LLVMIR dialect (#98921) Also adds tests. These are being added to be used in the GPU to LLVM SPV pass.
--------- Co-authored-by: Victor Perez --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 2 ++ mlir/lib/Target/LLVMIR/ModuleImport.cpp | 6 +++++ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 4 ++++ mlir/test/Dialect/LLVMIR/func.mlir | 13 +++++++++++ .../LLVMIR/Import/function-attributes.ll | 12 ++++++++++ mlir/test/Target/LLVMIR/llvmir.mlir | 22 +++++++++++++++++++ 6 files changed, 59 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index f0dec69a5032a..06656c791c594 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1461,6 +1461,8 @@ def LLVM_LLVMFuncOp : LLVM_Op<"func", [ OptionalAttr:$fp_contract, OptionalAttr:$no_inline, OptionalAttr:$always_inline, + OptionalAttr:$no_unwind, + OptionalAttr:$will_return, OptionalAttr:$optimize_none ); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 5bc3dd680d02d..16007592175f7 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1686,11 +1686,13 @@ static constexpr std::array kExplicitAttributes{ StringLiteral("no-nans-fp-math"), StringLiteral("no-signed-zeros-fp-math"), StringLiteral("noinline"), + StringLiteral("nounwind"), StringLiteral("optnone"), StringLiteral("target-features"), StringLiteral("tune-cpu"), StringLiteral("unsafe-fp-math"), StringLiteral("vscale_range"), + StringLiteral("willreturn"), }; static void processPassthroughAttrs(llvm::Function *func, LLVMFuncOp funcOp) { @@ -1763,6 +1765,10 @@ void ModuleImport::processFunctionAttributes(llvm::Function *func, funcOp.setOptimizeNone(true); if (func->hasFnAttribute(llvm::Attribute::Convergent)) funcOp.setConvergent(true); + if (func->hasFnAttribute(llvm::Attribute::NoUnwind)) + funcOp.setNoUnwind(true); + if (func->hasFnAttribute(llvm::Attribute::WillReturn)) + funcOp.setWillReturn(true); if (func->hasFnAttribute("aarch64_pstate_sm_enabled")) funcOp.setArmStreaming(true); diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index ef226dd3a77d5..fc3fb0b5334c1 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1441,6 +1441,10 @@ static void convertFunctionAttributes(LLVMFuncOp func, llvmFunc->addFnAttr(llvm::Attribute::OptimizeNone); if (func.getConvergentAttr()) llvmFunc->addFnAttr(llvm::Attribute::Convergent); + if (func.getNoUnwindAttr()) + llvmFunc->addFnAttr(llvm::Attribute::NoUnwind); + if (func.getWillReturnAttr()) + llvmFunc->addFnAttr(llvm::Attribute::WillReturn); convertFunctionMemoryAttributes(func, llvmFunc); } diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir index e0810a23697f8..0e29a548de72f 100644 --- a/mlir/test/Dialect/LLVMIR/func.mlir +++ b/mlir/test/Dialect/LLVMIR/func.mlir @@ -312,6 +312,19 @@ module { llvm.return } + llvm.func @nounwind_function() attributes {no_unwind} { + // CHECK: @nounwind_function + // CHECK-SAME: attributes {no_unwind} + llvm.return + } + + llvm.func @willreturn_function() attributes {will_return} { + // CHECK: @willreturn_function + // CHECK-SAME: attributes {will_return} + llvm.return + } + + } // ----- diff --git a/mlir/test/Target/LLVMIR/Import/function-attributes.ll b/mlir/test/Target/LLVMIR/Import/function-attributes.ll index 9ca6f62fd0e2d..6c38979a0a719 100644 --- a/mlir/test/Target/LLVMIR/Import/function-attributes.ll +++ 
b/mlir/test/Target/LLVMIR/Import/function-attributes.ll @@ -385,3 +385,15 @@ declare void @optnone_attribute() noinline optnone ; CHECK-LABEL: @convergent_attribute ; CHECK-SAME: attributes {convergent} declare void @convergent_attribute() convergent + +// ----- + +; CHECK-LABEL: @nounwind_attribute +; CHECK-SAME: attributes {no_unwind} +declare void @nounwind_attribute() nounwind + +// ----- + +; CHECK-LABEL: @willreturn_attribute +; CHECK-SAME: attributes {will_return} +declare void @willreturn_attribute() willreturn diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 29fdb55c1b301..132a8eb668eba 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2452,3 +2452,25 @@ llvm.func @convergent() attributes { convergent } { // CHECK: #[[ATTRS]] // CHECK-SAME: convergent + +// ----- + +// CHECK-LABEL: @nounwind +// CHECK-SAME: #[[ATTRS:[0-9]+]] +llvm.func @nounwind() attributes { no_unwind } { + llvm.return +} + +// CHECK: #[[ATTRS]] +// CHECK-SAME: nounwind + +// ----- + +// CHECK-LABEL: @willreturn +// CHECK-SAME: #[[ATTRS:[0-9]+]] +llvm.func @willreturn() attributes { will_return } { + llvm.return +} + +// CHECK: #[[ATTRS]] +// CHECK-SAME: willreturn From 39d751ad976ba9f5e8a1ad3880559faba38c3c3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 17 Jul 2024 11:18:13 +0200 Subject: [PATCH 243/777] [clang][Interp] Use an array root's field decl in the LValuePath Instead of pushing the index 0. --- clang/lib/AST/Interp/Pointer.cpp | 5 +++-- clang/test/AST/Interp/functions.cpp | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp index b2e3a7ff70881..ff4da0fa805dc 100644 --- a/clang/lib/AST/Interp/Pointer.cpp +++ b/clang/lib/AST/Interp/Pointer.cpp @@ -152,8 +152,9 @@ APValue Pointer::toAPValue() const { Pointer Ptr = *this; while (Ptr.isField() || Ptr.isArrayElement()) { if (Ptr.isArrayRoot()) { - Path.push_back(APValue::LValuePathEntry::ArrayIndex(0)); - Ptr = Ptr.getBase(); + Path.push_back(APValue::LValuePathEntry( + {Ptr.getFieldDesc()->asDecl(), /*IsVirtual=*/false})); + Ptr = Ptr.getBase(); } else if (Ptr.isArrayElement()) { if (Ptr.isOnePastEnd()) Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getArray().getNumElems())); diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp index fa29e08a30175..f190262ad3ebe 100644 --- a/clang/test/AST/Interp/functions.cpp +++ b/clang/test/AST/Interp/functions.cpp @@ -644,3 +644,21 @@ namespace FunctionCast { // both-warning {{are a Clang extension}} int b[(int)IntFn(f)()]; // ok } + +#if __cplusplus >= 202002L +namespace StableAddress { + template struct str { + char arr[N]; + }; + // FIXME: Deduction guide not needed with P1816R0. + template str(const char (&)[N]) -> str; + + template constexpr int sum() { + int n = 0; + for (char c : s.arr) + n += c; + return n; + } + static_assert(sum() == 1234, ""); +} +#endif From cf673604c16cc3aeee604642a2c6ea30b0eeeaba Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 17 Jul 2024 10:29:57 +0100 Subject: [PATCH 244/777] [LV] Use VF from selected plan when creating InnerLoopVectorizer. This makes sure the same VF is used when executing the plan and in the functions in InnerLoopVectorizer when the assertion is disabled (e.g. release builds). No tests added as they would trigger an assertion. 
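To make the failure mode concrete, here is a small standalone sketch (not part of this change; all names are invented for illustration): when two components compute a value independently and only an assert checks that they agree, builds compiled with NDEBUG drop the check and silently keep whichever value the code happens to read.

    #include <cassert>
    #include <cstdio>

    int legacyCostModelVF() { return 4; }
    int vplanCostModelVF() { return 8; } // suppose the two models disagree

    int main() {
      int selectedVF = vplanCostModelVF();
      // With assertions enabled this aborts, exposing the disagreement;
      // under NDEBUG it is compiled out and nothing is checked.
      assert(selectedVF == legacyCostModelVF() && "cost models disagreed");
      // Fix pattern mirrored by the change below: construct everything
      // downstream from the selected value instead of re-reading the legacy one.
      std::printf("vectorizing with VF = %d\n", selectedVF);
      return 0;
    }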
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1481ddffe6b26..3bdb545946e2b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -10402,10 +10402,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); - VPlan &BestPlan = LVP.getBestPlan(); assert(size(BestPlan.vectorFactors()) == 1 && "Plan should have a single VF"); @@ -10414,6 +10410,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { << "\n"); assert(VF.Width == Width && "VPlan cost model and legacy cost model disagreed"); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width, + VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, + PSI, Checks); LVP.executePlan(Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; From 35a3b665bb321b114fb15a7c38065ad8a67e5ef6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Jul 2024 10:35:00 +0100 Subject: [PATCH 245/777] [X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (#98466) Share PSHUFB masks where we have no overlap in used elements. Fixes #98346 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 70 +- llvm/test/CodeGen/X86/oddshuffles.ll | 14 +- .../X86/shuffle-strided-with-offset-512.ll | 5 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 10 +- .../vector-interleaved-load-i16-stride-2.ll | 99 +- .../vector-interleaved-load-i16-stride-4.ll | 1338 +++-- .../vector-interleaved-load-i16-stride-5.ll | 116 +- .../vector-interleaved-load-i16-stride-6.ll | 4568 ++++++++--------- .../vector-interleaved-load-i16-stride-7.ll | 1785 +++---- .../vector-interleaved-load-i8-stride-2.ll | 84 +- .../vector-interleaved-load-i8-stride-5.ll | 142 +- .../vector-interleaved-load-i8-stride-6.ll | 232 +- .../vector-interleaved-load-i8-stride-7.ll | 392 +- .../vector-interleaved-store-i16-stride-3.ll | 28 +- .../vector-interleaved-store-i16-stride-4.ll | 28 +- .../vector-interleaved-store-i16-stride-5.ll | 86 +- .../vector-interleaved-store-i16-stride-6.ll | 228 +- .../vector-interleaved-store-i16-stride-7.ll | 21 +- .../vector-interleaved-store-i16-stride-8.ll | 30 +- .../vector-interleaved-store-i8-stride-6.ll | 77 +- .../vector-interleaved-store-i8-stride-8.ll | 293 +- .../CodeGen/X86/vector-shuffle-256-v16.ll | 21 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 40 +- 23 files changed, 4936 insertions(+), 4771 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bcdd31c22d314..64303130922bd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41024,23 +41024,59 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, case X86ISD::BLENDI: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); - - // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. - // TODO: Handle MVT::v16i16 repeated blend mask. 
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && - N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { - MVT SrcVT = N0.getOperand(0).getSimpleValueType(); - if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && - SrcVT.getScalarSizeInBits() >= 32) { - unsigned Size = VT.getVectorNumElements(); - unsigned NewSize = SrcVT.getVectorNumElements(); - APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size); - APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize); - return DAG.getBitcast( - VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), - N1.getOperand(0), - DAG.getTargetConstant(NewBlendMask.getZExtValue(), - DL, MVT::i8))); + unsigned EltBits = VT.getScalarSizeInBits(); + + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) { + // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. + // TODO: Handle MVT::v16i16 repeated blend mask. + if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { + MVT SrcVT = N0.getOperand(0).getSimpleValueType(); + unsigned SrcBits = SrcVT.getScalarSizeInBits(); + if ((EltBits % SrcBits) == 0 && SrcBits >= 32) { + unsigned Size = VT.getVectorNumElements(); + unsigned NewSize = SrcVT.getVectorNumElements(); + APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size); + APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), + N1.getOperand(0), + DAG.getTargetConstant(NewBlendMask.getZExtValue(), + DL, MVT::i8))); + } + } + // Share PSHUFB masks: + // blend(pshufb(x,m1),pshufb(y,m2)) + // --> m3 = blend(m1,m2) + // blend(pshufb(x,m3),pshufb(y,m3)) + if (N0.hasOneUse() && N1.hasOneUse()) { + SmallVector Mask, ByteMask; + SmallVector Ops; + SDValue LHS = peekThroughOneUseBitcasts(N0); + SDValue RHS = peekThroughOneUseBitcasts(N1); + if (LHS.getOpcode() == X86ISD::PSHUFB && + RHS.getOpcode() == X86ISD::PSHUFB && + LHS.getOperand(1) != RHS.getOperand(1) && + LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() && + getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) { + assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) && + RHS == peekThroughOneUseBitcasts(Ops[1]) && + "BLENDI decode mismatch"); + MVT ShufVT = LHS.getSimpleValueType(); + SDValue MaskLHS = LHS.getOperand(1); + SDValue MaskRHS = RHS.getOperand(1); + llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask); + if (SDValue NewMask = combineX86ShufflesConstants( + ShufVT, {MaskLHS, MaskRHS}, ByteMask, + /*HasVariableMask=*/true, DAG, DL, Subtarget)) { + SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, + LHS.getOperand(0), NewMask); + SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, + RHS.getOperand(0), NewMask); + return DAG.getNode(X86ISD::BLENDI, DL, VT, + DAG.getBitcast(VT, NewLHS), + DAG.getBitcast(VT, NewRHS), N.getOperand(2)); + } + } } } return SDValue(); diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index d3a3b1e980db0..b40b2c82843cc 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1294,10 +1294,11 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] @@ -1339,10 +1340,11 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rcx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index e94f51233256c..45842d4148a8b 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -12,8 +12,9 @@ define void @shuffle_v64i8_to_v32i8_1(ptr %L, ptr %S) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) diff --git 
a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 95e249984e184..cf0820aac3262 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -20,8 +20,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) @@ -44,8 +45,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind { ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index 3bc97f71f04fb..00e43df15deea 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -488,8 +488,9 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovaps %ymm2, (%rsi) @@ -506,8 +507,9 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FP-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi) @@ -524,8 +526,9 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) @@ -736,14 +739,13 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovaps %ymm5, (%rsi) @@ -768,14 +770,13 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) @@ -800,14 +801,13 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) @@ -1180,20 +1180,20 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm2[0,2],ymm7[4,6],ymm2[4,6] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2],ymm2[0,2],ymm9[4,6],ymm2[4,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[0,2],ymm7[0,2],ymm10[4,6],ymm7[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,1,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,2],ymm9[0,2],ymm10[4,6],ymm9[4,6] +; 
AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] @@ -1206,32 +1206,31 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,1,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] ; AVX2-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-NEXT: vpshufb %ymm12, %ymm4, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX2-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] ; AVX2-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovaps %ymm11, 64(%rsi) ; AVX2-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-NEXT: vmovaps %ymm9, 96(%rsi) ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX2-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-NEXT: vmovdqa %ymm4, 96(%rdx) -; AVX2-NEXT: vmovdqa %ymm8, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm7, 32(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1262,22 +1261,21 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] ; AVX2-FP-NEXT: 
vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rsi) @@ -1318,22 +1316,21 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 3f77e50260c8d..df28ac14a30c0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -1235,64 +1235,62 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; 
AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1382,10 +1380,9 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; 
AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 @@ -1400,7 +1397,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 @@ -1497,10 +1494,9 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 @@ -1515,7 +1511,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 @@ -2563,145 +2559,140 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i16_stride4_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $104, %rsp -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 
; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm9 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, 
%xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm13 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm13 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm8 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm11 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm10 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm4 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm4 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: 
vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpshuflw 
{{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) @@ -2858,9 +2849,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9 @@ -2879,54 +2870,53 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm13 -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm15 -; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm15 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; 
AVX512-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12 +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13 +; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13 +; AVX512-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm3 +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -3070,9 +3060,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm4 = [0,2,2,3,0,2,4,6] -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9 @@ -3091,54 +3081,53 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm13 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm13 -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm15 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm15 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12 +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, 
%ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13 +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm3 +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -5349,10 +5338,10 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i16_stride4_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 @@ -5364,252 +5353,243 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; 
AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FCP-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm12 -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vpshufb 
%ymm4, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm14 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa 
%xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm10 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm11 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm13 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm10, 
%ymm14, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 
= xmm9[3,1,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm9 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm7 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] @@ -5618,51 +5598,48 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = 
xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw 
{{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -5693,8 +5670,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm15, (%r8) ; AVX2-FCP-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -5976,148 +5952,147 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride4_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm24 -; AVX512-FCP-NEXT: vpermd %ymm24, %ymm1, %ymm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23 +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm1, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] -; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 +; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 -; AVX512-FCP-NEXT: vpermd %ymm27, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm1, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm12 -; AVX512-FCP-NEXT: vpermt2d %ymm9, %ymm7, %ymm12 -; AVX512-FCP-NEXT: vpmovqw %zmm30, %xmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 480(%rdi), %ymm16 -; AVX512-FCP-NEXT: vpermd %ymm16, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512-FCP-NEXT: vpermd 
%ymm27, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12 +; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28 +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17 -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm1, %ymm12 +; AVX512-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13 -; AVX512-FCP-NEXT: vpermt2d %ymm9, %ymm7, %ymm13 -; AVX512-FCP-NEXT: vpmovqw %zmm26, %xmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13 +; AVX512-FCP-NEXT: vpmovqw %zmm25, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512-FCP-NEXT: vpermd %ymm18, %ymm1, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm20 -; AVX512-FCP-NEXT: vpermd %ymm20, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512-FCP-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 -; AVX512-FCP-NEXT: vpmovqw %zmm23, %xmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[0,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm14 -; AVX512-FCP-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9 +; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9 +; AVX512-FCP-NEXT: vpmovqw %zmm22, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm30, %zmm8 -; AVX512-FCP-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[0,1,2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpmovqw %zmm10, %xmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, 
%ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm26, %zmm3 +; AVX512-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] -; AVX512-FCP-NEXT: vpermd %ymm24, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm15, %ymm8 +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm27, %ymm15, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm15, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm30, %zmm13 -; AVX512-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[0,1,2,3] -; AVX512-FCP-NEXT: vpermd %ymm16, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm1 -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm11 -; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13 +; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13 +; AVX512-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10 +; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10 +; 
AVX512-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm18, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vpermd %ymm20, %ymm15, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6 +; AVX512-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7 +; AVX512-FCP-NEXT: vpmovqw %zmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm30, %zmm4 -; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vpmovqw 
%zmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -6400,148 +6375,147 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm1, %ymm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm1, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 -; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm1, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm12 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm9, %ymm7, %ymm12 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm30, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 480(%rdi), %ymm16 -; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm11, 
%ymm7, %ymm12 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm1, %ymm12 +; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm9, %ymm7, %ymm13 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm26, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm25, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm1, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm23, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm22, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm30, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb 
%ymm9, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm15, %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm15, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm15, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm30, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm1 -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm11 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13 +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd 
%ymm17, %ymm14, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10 +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6 +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 
(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 8e55cb48cf7a2..b18f08b62f0d4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -937,24 +937,27 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FP-NEXT: vpsllq $48, %xmm4, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] ; AVX2-FP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovdqa %xmm5, (%rcx) @@ -980,24 +983,27 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpsllq $48, %xmm4, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] ; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rcx) @@ -1069,24 +1075,27 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] ; AVX512-FCP-NEXT: vpsllq $48, %xmm3, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rcx) @@ -1158,24 +1167,27 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] ; AVX512DQ-FCP-NEXT: vpsllq $48, %xmm3, 
%xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 1ddd8166c998e..605deed6536bf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1023,11 +1023,12 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] @@ -1072,34 +1073,36 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-LABEL: load_i16_stride6_vf8: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FP-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX2-FP-NEXT: vpslld $16, %xmm3, %xmm7 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] ; AVX2-FP-NEXT: vpbroadcastw 74(%rdi), %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] @@ -1107,24 +1110,25 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi) ; AVX2-FP-NEXT: vmovdqa %xmm5, 
(%rdx) ; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-FP-NEXT: vmovdqa %xmm6, (%r8) -; AVX2-FP-NEXT: vmovdqa %xmm1, (%r9) +; AVX2-FP-NEXT: vmovdqa %xmm4, (%r9) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1132,34 +1136,36 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-LABEL: load_i16_stride6_vf8: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpslld $16, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] ; AVX2-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} 
xmm6 = xmm6[0,3,2,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] @@ -1167,24 +1173,25 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r8) -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%r9) +; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r9) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1259,35 +1266,36 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i16_stride6_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpsrldq 
{{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpslld $16, %xmm3, %xmm7 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] +; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] @@ -1295,25 +1303,26 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r8) -; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1387,35 +1396,36 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpslld $16, %xmm3, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] +; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] @@ -1423,25 +1433,26 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2174,27 +2185,29 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i16_stride6_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm7 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; 
AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] @@ -2208,10 +2221,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] @@ -2220,7 +2233,7 @@ define void 
@load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2245,39 +2258,42 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm7, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FP-NEXT: vzeroupper @@ -2286,27 +2302,29 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i16_stride6_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm7 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX2-FCP-NEXT: vpblendw 
{{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] @@ -2320,10 +2338,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] @@ -2332,7 +2350,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2357,39 +2375,42 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm7, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm6 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper @@ -2421,17 +2442,18 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm9, %xmm13, %xmm12 +; AVX512-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -2509,42 +2531,45 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride6_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] -; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} 
xmm9 = xmm13[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -2553,7 +2578,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -2577,36 +2602,39 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm12 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] -; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq $248, %ymm11, %ymm2, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -2638,17 +2666,18 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm13, %xmm12 +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -2726,42 +2755,45 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} 
xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -2770,7 +2802,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -2794,36 +2826,39 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm12 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm11, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -4410,9 +4445,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FP-LABEL: load_i16_stride6_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 @@ -4421,12 +4456,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[0,1],ymm2[0,1] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FP-NEXT: vpshufb %ymm4, 
%ymm1, %ymm6 @@ -4440,121 +4475,116 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm4, %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm10, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm1 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm11[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm15, %xmm11, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] -; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] +; AVX2-FP-NEXT: vpshufb %ymm13, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] -; 
AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6],xmm10[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7] +; AVX2-FP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FP-NEXT: 
vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm2 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] @@ -4565,106 +4595,104 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
pt ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,3,2,1] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm11, %xmm12 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm9 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm14, %ymm12 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,1,0,3] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; 
AVX2-FP-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1,2],xmm7[3],xmm8[4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm15, %ymm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2],xmm2[3],xmm7[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm14, %ymm4 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; 
AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm13, %ymm2 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm3 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm7 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm9 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; 
AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm10, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm9, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -4675,27 +4703,27 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm6, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FP-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride6_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 @@ -4704,12 +4732,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[0,1],ymm2[0,1] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm6 @@ -4723,121 +4751,116 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6],xmm10[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] @@ -4848,106 +4871,104 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,3,2,1] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm12 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm9 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,1,0,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} 
xmm9 = xmm9[0,1,2,1] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1,2],xmm7[3],xmm8[4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2],xmm2[3],xmm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm4 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm2 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm3 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -4958,19 +4979,19 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FCP-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -5040,10 +5061,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] ; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512-NEXT: 
vmovdqa {{.*#+}} xmm12 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4 @@ -5053,7 +5073,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -5262,109 +5282,105 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i16_stride6_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $136, %rsp -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm8 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6,7] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] -; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm27 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],mem[2,3] -; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] -; AVX512-FCP-NEXT: 
vmovdqa64 %ymm12, %ymm25 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2],xmm3[3],xmm11[4,5],xmm3[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm0 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm31 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm31 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] @@ -5384,6 +5400,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] @@ -5400,8 +5417,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] @@ -5440,96 +5457,93 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $236, %ymm23, %ymm13, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, 
%ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm5 ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm15 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm15[4],xmm9[5],xmm15[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5],ymm9[6],ymm11[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm13 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: addq $136, %rsp ; AVX512-FCP-NEXT: vzeroupper @@ -5539,10 +5553,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] @@ -5554,8 +5568,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7] @@ -5565,8 +5579,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3] @@ -5574,14 +5588,14 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7] ; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm28 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-NEXT: vmovdqa64 
%ymm5, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7] @@ -5591,8 +5605,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] ; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] @@ -5603,23 +5617,22 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm15, %xmm10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm8 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm10, %zmm3 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3],xmm8[4,5],xmm14[6],xmm8[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm8, %zmm3 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm0 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm2 @@ -5630,19 +5643,19 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm20 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm29 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm30 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] @@ -5653,8 +5666,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] @@ -5663,38 +5677,38 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,7] -; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] @@ -5708,28 +5722,28 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm16 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1] @@ -5740,37 +5754,37 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm13 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $236, %ymm10, %ymm8, %ymm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq $236, %ymm14, %ymm8, %ymm6 ; AVX512DQ-NEXT: movw 
$31, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm8 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm14 +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm11 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm14[4],xmm8[5],xmm14[6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm14 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] @@ -5781,12 +5795,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm5, %xmm2 +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $236, %ymm10, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpternlogq $236, %ymm14, %ymm3, %ymm2 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] @@ -5813,155 +5827,152 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: pushq %rax +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6],ymm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm4 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm18 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3],xmm8[4,5],xmm0[6],xmm8[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm11 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm30 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm30 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3],xmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 @@ -5971,112 +5982,110 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,7,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm6, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, 
%ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3],xmm12[4],xmm6[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm19, %ymm12, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = 
xmm9[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm13, %ymm11, %ymm5 ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5],ymm11[6],ymm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm13, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; 
AVX512DQ-FCP-NEXT: popq %rax ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -9314,213 +9323,209 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm4 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm4 +; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; 
AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm9 +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm1 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm10 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm9 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm12[2],xmm4[3],xmm12[4,5],xmm4[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm12 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm4, %ymm15 +; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm10, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0,1,2],ymm6[3,4,5,6,7],ymm12[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm10, %xmm6 -; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm10 = mem[1,1,1,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6],xmm10[7] -; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm11, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3],xmm1[4,5],xmm8[6],xmm1[7] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm0 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm14, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[1,1,1,1,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -9679,8 +9684,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -9697,7 +9701,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] @@ -9709,16 +9713,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; 
AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload @@ -9726,10 +9729,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload @@ -9739,88 +9742,86 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm13 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm14, %xmm15 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm10 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FP-NEXT: vpshufd 
{{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm12 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm15, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm12 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm11, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm10, 96(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 
96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FP-NEXT: addq $1304, %rsp # imm = 0x518 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -9909,213 +9910,209 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm4 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm2, 
%xmm6 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm4 +; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte 
Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX2-FCP-NEXT: vextracti128 
$1, %ymm5, %xmm2 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] +; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm9 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm1 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm9 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm12[2],xmm4[3],xmm12[4,5],xmm4[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm15 +; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0,1,2],ymm6[3,4,5,6,7],ymm12[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm6 -; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm10 = mem[1,1,1,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6],xmm10[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3],xmm1[4,5],xmm8[6],xmm1[7] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm0 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[1,1,1,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -10274,8 +10271,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -10292,7 +10288,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] @@ -10304,16 +10300,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload @@ -10321,10 +10316,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload @@ -10334,88 +10329,86 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm15 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd $31, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 
# 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm11, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm8, 96(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm9, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm10, 96(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FCP-NEXT: addq $1304, %rsp # imm = 0x518 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -10445,8 +10438,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] -; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] +; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10467,8 +10460,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10477,12 +10470,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm2 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512-NEXT: 
vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10512,10 +10505,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4],xmm6[5,6,7] -; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] +; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10523,97 +10516,96 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3] -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm3 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7] +; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512-NEXT: vpshufb %xmm11, %xmm15, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm0 +; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm14 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %xmm11, %xmm9, %xmm0 -; AVX512-NEXT: vpshufb %xmm15, %xmm5, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-NEXT: vpshufb %ymm2, %ymm7, %ymm4 +; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm0 +; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = 
[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm0 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm15, %xmm12, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %ymm29, %ymm0 
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] @@ -11049,79 +11041,77 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride6_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX512-FCP-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 
416(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm1 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = 
ymm1[2,3],mem[2,3] -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -11130,21 +11120,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11152,30 +11142,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11184,70 +11174,69 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512-FCP-NEXT: 
vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm5 +; AVX512-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -11295,8 +11284,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd $219, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] @@ -11311,16 +11300,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: 
vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -11369,20 +11357,19 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] @@ -11416,10 +11403,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm23 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11453,27 +11440,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm28 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm5, %zmm26, %zmm28 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = 
[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 @@ -11481,16 +11467,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm9 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm2 ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -11498,174 +11484,164 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 ; 
AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd $146, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm14 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; 
AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm31 +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm1[5,6,7] +; AVX512-FCP-NEXT: 
vpshufb %ymm0, %ymm5, %ymm12 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm9 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17 +; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm10, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm1[4],xmm9[5],xmm1[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti32x4 $2, 
%xmm11, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7] +; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm14, %ymm6 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm15, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm4 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx) +; AVX512-FCP-NEXT: vpternlogq $184, %zmm30, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm31, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm11 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 
%zmm25, 64(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -11694,8 +11670,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm1 @@ -11714,8 +11690,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm1 @@ -11728,16 +11704,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm19 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm27 ; 
AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] @@ -11763,87 +11739,86 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm11 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3] -; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm1 -; 
AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29 ; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm14, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm7 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm13 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm7 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm9, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm5, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; 
AVX512DQ-NEXT: vpshufb %ymm4, %ymm7, %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm3 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm6 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 @@ -11855,8 +11830,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm15, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1 @@ -12271,19 +12246,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $904, %rsp # imm = 0x388 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: subq $872, %rsp # imm = 0x368 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -12291,121 +12265,120 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm2 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5],ymm5[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-FCP-NEXT: vpshufb 
%ymm5, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; 
AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3],xmm2[4,5],xmm10[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 @@ -12415,25 +12388,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = 
[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2],xmm0[3],xmm13[4,5],xmm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -12441,40 +12413,40 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -12486,8 +12458,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $36, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] @@ -12512,7 +12484,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] @@ -12532,12 +12504,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm18 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] @@ -12545,9 +12517,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm28, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -12597,18 +12568,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm29 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] @@ -12642,11 +12613,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm27 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] @@ -12671,197 +12642,192 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: 
vpternlogq $226, %zmm0, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm28 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm12 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; 
AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm15 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512DQ-FCP-NEXT: vextracti128 
$1, %ymm15, %xmm6 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm24 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 +; 
AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm24 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm13 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = 
[10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2,3],xmm4[4],xmm14[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm26 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm4[4],xmm11[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm14[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm23 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, 
%ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm28, %zmm23 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm28, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: addq $904, %rsp # imm = 0x388 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FCP-NEXT: addq $872, %rsp # imm = 0x368 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 9134e490535ba..af340d15fe8f6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -1320,11 +1320,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 @@ -1406,11 +1407,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm9 @@ -1429,11 +1431,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm13, %xmm12 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] ; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] @@ -1483,11 +1486,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3] ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9 @@ -1506,11 +1510,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] @@ -1558,13 +1563,14 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] @@ -1642,13 +1648,14 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] @@ -1665,11 +1672,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] @@ -1717,13 +1725,14 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3] ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] @@ -1801,13 +1810,14 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] @@ -1824,11 +1834,12 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] @@ -3046,142 +3057,145 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,5,1,0,4,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = 
ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vmovdqa %xmm12, %xmm11 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm13, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,5,1,0,4,0,0,0] +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm10 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm12, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] 
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,6,1,0,5,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,6,1,0,5,0,0,0] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,1,1,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = 
xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm14, %ymm10 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,1,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,3,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm15 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,2,6,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm15[1],xmm5[2],xmm15[3],xmm5[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = 
xmm14[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,7,2,6,0,0,0] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2],xmm12[3],xmm4[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm11[2,3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm14, %ymm4 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,0,3,7,0] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm14 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,4,7,3,6,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3,4,5,6,7],ymm13[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,7,0,0,4,7,0] +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,0,3,7,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] @@ -3194,11 +3208,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm11, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm10, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm11, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm13, (%rax) ; 
AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper @@ -3378,146 +3392,148 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i16_stride7_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,6,9,0,13,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,9,0,12,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm8, %ymm14, %ymm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm14 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,4,8,11,15,0,0,0] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,4,7,11,14,0,0,0] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm16, %zmm13 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 
= ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa %ymm11, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -3697,146 +3713,148 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,6,9,0,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,9,0,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm14, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9 +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm14 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,4,8,11,15,0,0,0] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,4,7,11,14,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm16, %zmm13 +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -7141,296 +7159,299 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [10,3,6,15,12,13,6,15] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 
384(%rdi), %zmm9 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,0,0,4,8,11,15] -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,4,8,11,15] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm16, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,7,11,14] -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermd %zmm9, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,0,2] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,1,0,2] ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm8, %ymm22 -; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23 +; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3,4,5],xmm0[6],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vporq %ymm3, %ymm0, %ymm20 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512-FCP-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm13[1],xmm7[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3,4,5],xmm10[6],xmm1[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2],ymm6[3,4,5],ymm8[6],ymm6[7] +; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm18, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,5,8,12,15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermd %zmm30, 
%zmm12, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm19, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm18 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vporq %ymm4, %ymm0, %ymm20 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm4[2],ymm11[3,4,5],ymm4[6],ymm11[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm21 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm9, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,5,8,12,15] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512-FCP-NEXT: vpermd %zmm9, %zmm19, %zmm9 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm3, %ymm2, %ymm18 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm28[0,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,7,10,14,0,0,0] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm3 -; 
AVX512-FCP-NEXT: vpsrlq $48, %xmm7, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm1 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm16, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm23 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm14, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-FCP-NEXT: vpermd %zmm22, %zmm16, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0] ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpsrld $16, %xmm13, %xmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpsrld $16, %xmm12, %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm16 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,0,0,0,6,9,13,0] -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,6,9,13,0] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512-FCP-NEXT: vpor %ymm4, %ymm10, %ymm4 +; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,7,11,14,0,0,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,7,11,14,0,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,0,0,6,10,13,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] +; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpermd %zmm22, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2],xmm1[3],xmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,0,6,10,13,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,4,8,11,15,0,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,8,11,15,0,0,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,10,3,14,7,10,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm22, %zmm4, %zmm24 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm27 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm27, %zmm4, %zmm26 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm16, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm26 +; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm24 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm24, %zmm3, %zmm27 +; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm15 ; 
AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpternlogq $184, %zmm18, %zmm16, %zmm19 -; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm29, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1} +; AVX512-FCP-NEXT: vpternlogq $184, %zmm18, %zmm30, %zmm19 +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, %zmm25, %zmm30, %zmm11 +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: vpternlogq $226, %zmm10, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -7800,192 +7821,192 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm30 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,0,0,0,4,8,11,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,4,7,11,14] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14] +; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; 
AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm25[0,1,0,2] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm29 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm11, %ymm22 -; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23 +; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm13 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3],xmm12[4],xmm11[5],xmm12[6,7] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14 +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm11[3,4,5,6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm0, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2,3,4,5],xmm0[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm2[4],xmm11[5],xmm2[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 
= xmm11[0,1,2],xmm2[3,4,5,6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm17, %zmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm16 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7] +; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6],xmm14[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm17, %zmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm28 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,0,0,0,5,8,12,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,5,8,12,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm14, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm19, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm19, %zmm16 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4,5],ymm2[6],ymm9[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm11 -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3,4,5],xmm14[6],xmm11[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7],ymm14[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm11 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,7,10,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm8, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7],ymm14[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, 
%zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm12 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7] ; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm11 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm11, %zmm19, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm19, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0] @@ -7996,94 +8017,96 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm18, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm11, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm11, %ymm11 -; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm13, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm12[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3,4,5],xmm14[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2],xmm0[3],xmm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,0,0,0,6,10,13,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7],ymm14[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm19, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1} +; 
AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm18, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm13, %xmm11 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,4,7,11,14,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm13, %zmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm12, %zmm19, %zmm11 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm13, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 0db78440d3aa7..9f69a3cf44189 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -645,12 +645,14 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqa %ymm2, (%rsi) @@ -662,12 +664,14 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi) @@ -679,12 +683,14 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -694,18 +700,20 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-LABEL: load_i8_stride2_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -727,18 +735,20 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-LABEL: load_i8_stride2_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index e05b5ab9ebe02..43a45b9fd59a7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -1419,31 +1419,37 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,xmm3[4,9,14,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] +; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7] ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm8 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] ; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u] +; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5,6,7] ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: 
vmovdqa {{.*#+}} xmm3 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] @@ -2735,78 +2741,84 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7] +; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm11[5,6,7] ; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] -; AVX-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX-NEXT: vorps %ymm6, %ymm13, %ymm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] +; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 +; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX-NEXT: vpor %xmm6, %xmm13, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] +; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm14 +; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5,6,7] +; AVX-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12 +; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm13 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm13, 
%xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] -; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vandps %ymm12, %ymm13, %ymm13 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12 -; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm12 -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4],xmm12[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm12 +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm13 +; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm13 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] ; AVX-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] ; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX-NEXT: vorps %ymm6, %ymm13, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] +; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14 +; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX-NEXT: vpor %xmm13, %xmm15, 
%xmm13 +; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm15 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] -; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 -; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm14, %xmm14 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm6, %ymm6 +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] +; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] +; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index c77b232fde969..e4dc257543d20 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -2015,20 +2015,24 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX512-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm9 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[2,8,14] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] ; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[3,9,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm8, (%rsi) @@ -2083,20 +2087,24 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX512-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[2,8,14] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[3,9,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rsi) @@ -2151,20 +2159,24 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb 
{{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX512DQ-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm9 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[2,8,14] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] ; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[3,9,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm8, (%rsi) @@ -2219,20 +2231,24 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[2,8,14] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = 
[4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[3,9,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rsi) @@ -2247,11 +2263,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i8_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] @@ -2273,7 +2289,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] @@ -2289,22 +2305,26 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] ; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[2,8,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm0, 
%xmm1 +; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512BW-NEXT: vpshufb %xmm9, %xmm7, %xmm10 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] ; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[3,9,15] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512BW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] +; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) ; AVX512BW-NEXT: vmovdqa %xmm8, (%rcx) @@ -2317,11 +2337,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i8_stride6_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] @@ -2343,7 +2363,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} ; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] @@ -2359,22 +2379,26 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] ; AVX512BW-FCP-NEXT: vpor 
%xmm6, %xmm9, %xmm6 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[2,8,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[3,9,15] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] +; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%rcx) @@ -2387,11 +2411,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-LABEL: load_i8_stride6_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] @@ -2413,7 +2437,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, 
%xmm2 {%k2} ; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k3} +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] @@ -2429,22 +2453,26 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[2,8,14] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm7, %xmm10 +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[3,9,15] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%rcx) @@ -2457,11 +2485,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] @@ -2483,7 +2511,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] @@ -2499,22 +2527,26 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[2,8,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[3,9,15] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 0ee10a33c1d0c..130ae31b37bfe 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -2809,20 +2809,22 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5 -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm7[2,9] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] ; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] @@ -2905,20 +2907,22 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5 -; AVX512-FCP-NEXT: 
vextracti128 $1, %ymm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm7[2,9] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] @@ -3001,20 +3005,22 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5 -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm7[2,9] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] ; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] @@ -3097,20 +3103,22 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm7[2,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] @@ -3193,97 +3201,99 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512BW-NEXT: 
vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512BW-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k4} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} +; AVX512BW-NEXT: 
vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} -; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) ; AVX512BW-NEXT: vmovdqa %xmm10, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm9, (%r10) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rax) +; AVX512BW-NEXT: vmovdqa %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; 
@@ -3291,97 +3301,99 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, 
%xmm6 {%k1} ; AVX512BW-FCP-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512BW-FCP-NEXT: kmovd %edi, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k4} +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -3389,97 +3401,99 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512DQ-BW-NEXT: 
vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512DQ-BW-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k3} +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512DQ-BW-NEXT: vmovdqu8 
%xmm8, %xmm7 {%k1} ; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k4} +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa %xmm10, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%r10) -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -3487,97 +3501,99 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = 
[0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <112 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 23ddcd7cd0262..9d1939f66219f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -465,10 +465,11 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] @@ -488,10 +489,11 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] @@ -533,10 +535,11 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX512-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 @@ -579,10 +582,11 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,2,3,2,3,8,9,4,5,4,5,16,17,6,7,22,23,18,19,8,9,24,25,20,21,10,11] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX512DQ-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index f054c7edfff16..704c92924abfb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -218,10 +218,11 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u] -; AVX2-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -235,10 +236,11 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -268,10 +270,11 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31] +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7] ; AVX512-NEXT: vmovdqa %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -301,10 +304,11 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7] ; AVX512DQ-NEXT: vmovdqa %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index c1e7f1e8c6c72..7d2f52d3c5830 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -336,8 +336,9 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] ; AVX-NEXT: vmovdqa %xmm0, (%r9) @@ -356,10 +357,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -383,10 +385,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -409,10 +412,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -435,10 +439,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX512-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] @@ -462,10 +467,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] @@ -488,10 +494,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] @@ -515,10 +522,11 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] @@ -814,20 +822,22 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-LABEL: store_i16_stride5_vf8: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; 
AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,10,11,u,u,8,9,u,u,u,u,12,13,u,u,u,u,26,27,u,u,24,25,u,u,u,u,28,29] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,10,11,u,u,6,7,u,u,8,9,12,13,u,u,22,23,26,27,u,u,22,23,u,u,24,25,28,29,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[6,7,6,7,u,u,u,u,10,11,10,11,8,9,u,u,22,23,22,23,u,u,u,u,26,27,26,27,24,25,u,u] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,6,7,10,11,u,u,u,u,u,u,8,9,u,u,u,u,22,23,26,27,u,u,u,u,u,u,24,25] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6],ymm9[7],ymm8[8,9],ymm9[10,11],ymm8[12,13,14],ymm9[15] +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm5 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13,22,23,26,27,26,27,22,23,24,25,24,25,28,29,28,29] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3],ymm8[4],ymm7[5,6],ymm8[7],ymm7[8,9],ymm8[10],ymm7[11],ymm8[12],ymm7[13,14],ymm8[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,6,7,6,7,10,11,10,11,10,11,8,9,8,9,22,23,22,23,22,23,26,27,26,27,26,27,24,25,24,25] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm6, %ymm9 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] @@ -841,9 +851,9 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FP-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 12(%r8), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 824bd6e023c79..b33cc83ac3f79 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -100,10 +100,11 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) @@ -121,10 +122,11 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -164,10 +166,11 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rax) ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -207,10 +210,11 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -396,10 +400,11 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -428,10 +433,11 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -458,12 +464,13 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} 
ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3] +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -490,10 +497,11 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] ; AVX512-NEXT: vpbroadcastq %xmm4, %ymm4 @@ -524,12 +532,13 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm7 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3] +; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 @@ -557,10 +566,11 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; 
AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq %xmm4, %ymm4 @@ -591,12 +601,13 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm7 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3] +; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 @@ -854,25 +865,28 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -893,25 +907,28 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, 
%ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -934,9 +951,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,24,25,28,29] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0] ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] @@ -946,9 +964,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5] ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,10,11,14,15,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] @@ -956,9 +975,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,3,3,7,7,3,3,7] ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -979,26 +999,29 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] +; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] +; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -1020,9 +1043,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,10,11,14,15,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] @@ -1030,9 +1054,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,24,25,28,29] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] @@ -1043,9 +1068,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -1065,26 +1091,29 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -1106,9 +1135,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,10,11,14,15,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa 
{{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] @@ -1116,9 +1146,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,24,25,28,29] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] @@ -1129,9 +1160,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index e2a33019fffee..208ee607909ed 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -66,11 +66,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,2,3] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[6,7,10,11,14,15,u,u,u,u,u,u,12,13,14,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,3,6,7,10,11,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,0,1,4,5,8,9,2,3] +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6],xmm3[7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,10,11,14,15,2,3,6,7,10,11,12,13,14,15] +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] ; AVX-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX-NEXT: vmovq %xmm0, 16(%rax) @@ -1222,10 +1224,11 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,4,5,u,u,u,u,u,u,u,u,8,9,u,u,u,u,20,21,u,u,u,u,u,u,u,u,24,25,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,4,5,u,u,u,u,u,u,u,u,8,9,6,7,6,7,20,21,u,u,u,u,u,u,u,u,24,25,8,9] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm8, %ymm10 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3,4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11,12,13],ymm6[14],ymm10[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 2b268af107f6b..13c3c6a9939c1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -506,20 +506,24 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,26,27] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,30,31] -; 
AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,8,9,0,1,8,9,16,17,18,19,20,21,22,23,18,19,26,27,18,19,26,27] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,8,9,10,11,12,13,14,15,18,19,26,27,18,19,26,27,24,25,26,27,28,29,30,31] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,4,5,12,13,4,5,12,13,16,17,18,19,20,21,22,23,22,23,30,31,22,23,30,31] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,12,13,4,5,12,13,8,9,10,11,12,13,14,15,22,23,30,31,22,23,30,31,24,25,26,27,28,29,30,31] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index de34e48c01d7d..e43aa56c96c28 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -704,10 +704,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14],ymm2[15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] @@ -735,10 +736,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14],ymm2[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] @@ -766,10 +768,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14],ymm2[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] @@ -797,10 +800,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 @@ -828,10 +832,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 @@ -859,10 +864,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 @@ -890,10 +896,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 @@ -921,10 +928,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 @@ -954,10 +962,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 @@ -987,10 +996,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924 @@ -1020,10 +1030,11 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 2b4d0b1409a79..1771b53a0f835 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -991,20 +991,24 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,4,12],zero,zero,ymm1[u,u,u,u,5,13],zero,zero,ymm1[u,u,u,u],zero,zero,ymm1[22,30,u,u,u,u],zero,zero,ymm1[23,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4,12],zero,zero,ymm0[u,u,u,u,5,13],zero,zero,ymm0[u,u,u,u],zero,zero,ymm0[22,30,u,u,u,u],zero,zero,ymm0[23,31,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm1[u,u,u,u,0,8],zero,zero,ymm1[u,u,u,u,1,9],zero,zero,ymm1[u,u,u,u],zero,zero,ymm1[18,26,u,u,u,u],zero,zero,ymm1[19,27] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,ymm0[u,u,u,u,1,9],zero,zero,ymm0[u,u,u,u],zero,zero,ymm0[18,26,u,u,u,u],zero,zero,ymm0[19,27,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] +; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u],zero,zero,ymm1[4,12,u,u,u,u],zero,zero,ymm1[5,13,u,u,u,u,22,30],zero,zero,ymm1[u,u,u,u,23,31],zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm0[4,12,u,u,u,u],zero,zero,ymm0[5,13,u,u,u,u,22,30],zero,zero,ymm0[u,u,u,u,23,31],zero,zero,ymm0[u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u],zero,zero,ymm1[0,8,u,u,u,u],zero,zero,ymm1[1,9,u,u,u,u,18,26],zero,zero,ymm1[u,u,u,u,19,27],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8,u,u,u,u],zero,zero,ymm0[1,9,u,u,u,u,18,26],zero,zero,ymm0[u,u,u,u,19,27],zero,zero,ymm0[u,u,u,u] +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] +; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm0 @@ -1071,20 +1075,24 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,4,12],zero,zero,ymm1[u,u,u,u,5,13],zero,zero,ymm1[u,u,u,u],zero,zero,ymm1[22,30,u,u,u,u],zero,zero,ymm1[23,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4,12],zero,zero,ymm0[u,u,u,u,5,13],zero,zero,ymm0[u,u,u,u],zero,zero,ymm0[22,30,u,u,u,u],zero,zero,ymm0[23,31,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm1[u,u,u,u,0,8],zero,zero,ymm1[u,u,u,u,1,9],zero,zero,ymm1[u,u,u,u],zero,zero,ymm1[18,26,u,u,u,u],zero,zero,ymm1[19,27] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,ymm0[u,u,u,u,1,9],zero,zero,ymm0[u,u,u,u],zero,zero,ymm0[18,26,u,u,u,u],zero,zero,ymm0[19,27,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u],zero,zero,ymm1[4,12,u,u,u,u],zero,zero,ymm1[5,13,u,u,u,u,22,30],zero,zero,ymm1[u,u,u,u,23,31],zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm0[4,12,u,u,u,u],zero,zero,ymm0[5,13,u,u,u,u,22,30],zero,zero,ymm0[u,u,u,u,23,31],zero,zero,ymm0[u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u],zero,zero,ymm1[0,8,u,u,u,u],zero,zero,ymm1[1,9,u,u,u,u,18,26],zero,zero,ymm1[u,u,u,u,19,27],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8,u,u,u,u],zero,zero,ymm0[1,9,u,u,u,u,18,26],zero,zero,ymm0[u,u,u,u,19,27],zero,zero,ymm0[u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm0 @@ -2076,42 +2084,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: vpmovsxwd {{.*#+}} zmm5 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] -; AVX512BW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512BW-NEXT: vpshufb %zmm7, %zmm6, %zmm6 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm3[0,2,0,2,4,6,4,6] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512BW-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: vpmovsxwq {{.*#+}} zmm8 = [2048,2305,2562,2819,3076,3333,3590,3847] -; AVX512BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm0[0,2,0,2,4,6,4,6] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10 -; AVX512BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512BW-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k3} ; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] -; AVX512BW-NEXT: vpshufb %zmm5, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15] +; AVX512BW-NEXT: vpshufb %zmm4, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7] -; AVX512BW-NEXT: vpshufb %zmm7, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] -; AVX512BW-NEXT: vpshufb %zmm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} zmm3 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839] +; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] -; AVX512BW-NEXT: vpshufb %zmm10, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2132,43 +2138,41 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512BW-FCP-NEXT: vpshufb %zmm7, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vpmovsxwd {{.*#+}} zmm9 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] -; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm8, %zmm8 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3 -; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} zmm10 = [2048,2305,2562,2819,3076,3333,3590,3847] -; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm6 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb %zmm7, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15] +; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] -; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxdq {{.*#+}} zmm4 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839] +; AVX512BW-FCP-NEXT: vpshufb %zmm4, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] -; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %zmm4, 
%zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2189,42 +2193,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vpmovsxwd {{.*#+}} zmm5 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] -; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512DQ-BW-NEXT: vpshufb %zmm7, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm3[0,2,0,2,4,6,4,6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512DQ-BW-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vpmovsxwq {{.*#+}} zmm8 = [2048,2305,2562,2819,3076,3333,3590,3847] -; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm0[0,2,0,2,4,6,4,6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10 -; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k3} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] -; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15] +; AVX512DQ-BW-NEXT: vpshufb %zmm4, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: 
vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7] -; AVX512DQ-BW-NEXT: vpshufb %zmm7, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpshufb %zmm4, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] -; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxdq {{.*#+}} zmm3 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839] +; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] -; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2245,43 +2247,41 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm7, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm2[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vpmovsxwd {{.*#+}} zmm9 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm8, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} zmm10 = [2048,2305,2562,2819,3076,3333,3590,3847] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 
%zmm1, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm7, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxdq {{.*#+}} zmm4 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm4, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm4, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 @@ -3806,13 +3806,15 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm6 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,0,1,8,9,10,11,2,3,2,3,4,5,2,3,4,5,4,5,8,9,10,11,6,7,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 +; 
AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] @@ -4142,21 +4144,21 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm4 = [1284,1798] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm8 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm14 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] @@ -4180,12 +4182,11 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movw $-21846, %r11w # imm = 0xAAAA ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] @@ -4193,46 +4194,48 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm29 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm29 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%r10), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rax), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6],ymm1[7],ymm14[8,9,10],ymm1[11],ymm14[12,13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6],ymm0[7],ymm14[8,9,10],ymm0[11],ymm14[12,13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,2,3,0,1,0,1,8,9,10,11,2,3,2,3,4,5,2,3,4,5,4,5,8,9,10,11,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm15 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 @@ -4240,10 +4243,10 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec 
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 @@ -4253,9 +4256,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm4 @@ -4264,15 +4267,15 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 
9a6d8c3366d98..5dd16c7b25790 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -5502,9 +5502,10 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2 ; ; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,u,u,u,u,6,7,4,5,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,0,1,2,3,0,1,6,7,4,5,6,7,12,13,18,19,16,17,2,3,0,1,22,23,20,21,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,0,1,u,u,u,u,6,7,12,13,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: retq ; @@ -7469,14 +7470,15 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; ; AVX2-FAST-PERLANE-LABEL: PR24935: ; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,2,3,2,3,u,u,10,11,u,u,6,7,u,u,2,3,18,19,18,19,u,u,26,27,8,9,0,1,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4],ymm0[5,6,7,8],ymm3[9,10],ymm0[11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[8,9],zero,zero,zero,zero,ymm1[14,15,12,13,0,1,24,25,24,25],zero,zero,ymm1[24,25,16,17,30,31,28,29,16,17] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5],zero,zero,ymm1[10,11,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,2,3,2,3,u,u,10,11,u,u,u,u,u,u,u,u,18,19,18,19,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: retq @@ -7524,9 +7526,10 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) { ; AVX1-LABEL: PR34369: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6],xmm3[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,0,1,0,1,10,11,10,11,4,5,4,5,4,5] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6],xmm4[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[14,15,0,1,12,13,0,1,2,3,4,5,8,9,8,9] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index dbcb49507ea19..56170c5c7e699 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2630,15 +2630,17 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_ ; ; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] +; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX512VLBW-NEXT: retq ; @@ -2661,8 +2663,9 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_ ; ; XOPAVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] +; XOPAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -4837,15 +4840,17 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_ ; ; AVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: 
shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512VLBW-NEXT: retq ; @@ -4867,8 +4872,9 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_ ; ; XOPAVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; XOPAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; XOPAVX2-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> @@ -4892,8 +4898,9 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_ ; ; AVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq @@ -4917,8 +4924,9 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_ ; ; XOPAVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; XOPAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; XOPAVX2-NEXT: retq From f56db7860b38ac73310039b44770e844e4cb613d Mon Sep 17 00:00:00 2001 From: Benji Smith <6193112+Benjins@users.noreply.github.com> Date: Wed, 17 Jul 2024 05:35:25 -0400 Subject: [PATCH 246/777] [C API] Support new ptrauth constant type (#93909) This is a new 
constant type that was added to the C++ API in 0edc97f119f3ac3ff96b11183fe5c001a48a9a8d. This adds the ability to create instances of this constant and get its values to the C API. --- llvm/docs/ReleaseNotes.rst | 8 ++++++ llvm/include/llvm-c/Core.h | 45 +++++++++++++++++++++++++++++++ llvm/include/llvm/IR/Value.def | 2 +- llvm/lib/IR/Core.cpp | 23 ++++++++++++++++ llvm/test/Bindings/llvm-c/echo.ll | 5 ++++ llvm/tools/llvm-c-test/echo.cpp | 10 +++++++ 6 files changed, 92 insertions(+), 1 deletion(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 311ae0ea255ef..fa60049f67828 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -320,6 +320,14 @@ They are described in detail in the `debug info migration guide (VectorTy)->getElementCount().getKnownMinValue(); } +LLVMValueRef LLVMGetConstantPtrAuthPointer(LLVMValueRef PtrAuth) { + return wrap(unwrap(PtrAuth)->getPointer()); +} + +LLVMValueRef LLVMGetConstantPtrAuthKey(LLVMValueRef PtrAuth) { + return wrap(unwrap(PtrAuth)->getKey()); +} + +LLVMValueRef LLVMGetConstantPtrAuthDiscriminator(LLVMValueRef PtrAuth) { + return wrap(unwrap(PtrAuth)->getDiscriminator()); +} + +LLVMValueRef LLVMGetConstantPtrAuthAddrDiscriminator(LLVMValueRef PtrAuth) { + return wrap(unwrap(PtrAuth)->getAddrDiscriminator()); +} + /*--.. Operations on other types ...........................................--*/ LLVMTypeRef LLVMPointerTypeInContext(LLVMContextRef C, unsigned AddressSpace) { @@ -1663,6 +1679,13 @@ LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) { ArrayRef(unwrap(ScalarConstantVals, Size), Size))); } +LLVMValueRef LLVMConstantPtrAuth(LLVMValueRef Ptr, LLVMValueRef Key, + LLVMValueRef Disc, LLVMValueRef AddrDisc) { + return wrap(ConstantPtrAuth::get( + unwrap(Ptr), unwrap(Key), + unwrap(Disc), unwrap(AddrDisc))); +} + /*-- Opcode mapping */ static LLVMOpcode map_to_llvmopcode(int opcode) diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll index 06e1c44e0c490..ab9acbc0a66a5 100644 --- a/llvm/test/Bindings/llvm-c/echo.ll +++ b/llvm/test/Bindings/llvm-c/echo.ll @@ -37,6 +37,11 @@ module asm "classical GAS" @ifunc = ifunc i32 (i32), ptr @ifunc_resolver +@ptrauth_addr_disc = global i32 0 +@ptrauth_data = global i32 0 +@ptrauth_ptr_01 = global ptr ptrauth (ptr @ptrauth_data, i32 77, i64 1001, ptr @ptrauth_addr_disc) +@ptrauth_ptr_02 = global ptr ptrauth (ptr @ptrauth_data, i32 11, i64 99, ptr null) + define ptr @ifunc_resolver() { entry: ret ptr null diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp index c9bf03229683a..c5ae051c0a301 100644 --- a/llvm/tools/llvm-c-test/echo.cpp +++ b/llvm/tools/llvm-c-test/echo.cpp @@ -391,6 +391,16 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) { return LLVMConstVector(Elts.data(), EltCount); } + if (LLVMIsAConstantPtrAuth(Cst)) { + LLVMValueRef Ptr = clone_constant(LLVMGetConstantPtrAuthPointer(Cst), M); + LLVMValueRef Key = clone_constant(LLVMGetConstantPtrAuthKey(Cst), M); + LLVMValueRef Disc = + clone_constant(LLVMGetConstantPtrAuthDiscriminator(Cst), M); + LLVMValueRef AddrDisc = + clone_constant(LLVMGetConstantPtrAuthAddrDiscriminator(Cst), M); + return LLVMConstantPtrAuth(Ptr, Key, Disc, AddrDisc); + } + // At this point, if it's not a constant expression, it's a kind of constant // which is not supported if (!LLVMIsAConstantExpr(Cst)) From e9b2a25e90fb7fe47936f65b434e4ebe24773349 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 17 Jul 2024 11:36:07 
+0200 Subject: [PATCH 247/777] [nsan] Swap alignas and visibility order (NFC) (#98933) Use `alignas(16) SANITIZER_INTERFACE_ATTRIBUTE` instead of `SANITIZER_INTERFACE_ATTRIBUTE alignas(16)`, as the former is not supported prior to clang 16. See https://clang.godbolt.org/z/Wj1193xWK. This was broken by https://github.com/llvm/llvm-project/pull/96142 as part of other style changes. --- compiler-rt/lib/nsan/nsan.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index 194093c9679d0..718242c2ecdf8 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -390,24 +390,23 @@ __nsan_dump_shadow_mem(const u8 *addr, size_t size_bytes, size_t bytes_per_line, } } -SANITIZER_INTERFACE_ATTRIBUTE -alignas(16) thread_local uptr __nsan_shadow_ret_tag = 0; +alignas(16) SANITIZER_INTERFACE_ATTRIBUTE + thread_local uptr __nsan_shadow_ret_tag = 0; -SANITIZER_INTERFACE_ATTRIBUTE -alignas(16) thread_local char __nsan_shadow_ret_ptr[kMaxVectorWidth * - sizeof(__float128)]; +alignas(16) SANITIZER_INTERFACE_ATTRIBUTE + thread_local char __nsan_shadow_ret_ptr[kMaxVectorWidth * + sizeof(__float128)]; -SANITIZER_INTERFACE_ATTRIBUTE -alignas(16) thread_local uptr __nsan_shadow_args_tag = 0; +alignas(16) SANITIZER_INTERFACE_ATTRIBUTE + thread_local uptr __nsan_shadow_args_tag = 0; // Maximum number of args. This should be enough for anyone (tm). An alternate // scheme is to have the generated code create an alloca and make // __nsan_shadow_args_ptr point ot the alloca. constexpr const int kMaxNumArgs = 128; -SANITIZER_INTERFACE_ATTRIBUTE -alignas( - 16) thread_local char __nsan_shadow_args_ptr[kMaxVectorWidth * kMaxNumArgs * - sizeof(__float128)]; +alignas(16) SANITIZER_INTERFACE_ATTRIBUTE + thread_local char __nsan_shadow_args_ptr[kMaxVectorWidth * kMaxNumArgs * + sizeof(__float128)]; enum ContinuationType { // Keep in sync with instrumentation pass. kContinueWithShadow = 0, From e94e72a0c2293a02ea6c5335ac5fbc2d34de13f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 15 Jul 2024 13:19:33 +0200 Subject: [PATCH 248/777] Reapply "[clang][Interp] Implement dynamic memory allocation handling (#70306)" This reverts commit 48d703e7f56282ce5d690e45a129a4a7fd040ee6. 
--- clang/lib/AST/CMakeLists.txt | 1 + clang/lib/AST/Interp/Compiler.cpp | 111 +++++ clang/lib/AST/Interp/Compiler.h | 2 + clang/lib/AST/Interp/DynamicAllocator.cpp | 118 +++++ clang/lib/AST/Interp/DynamicAllocator.h | 102 +++++ clang/lib/AST/Interp/EvalEmitter.cpp | 23 + clang/lib/AST/Interp/EvaluationResult.cpp | 72 +++ clang/lib/AST/Interp/EvaluationResult.h | 6 + clang/lib/AST/Interp/Interp.cpp | 52 +++ clang/lib/AST/Interp/Interp.h | 152 +++++++ clang/lib/AST/Interp/InterpBlock.h | 11 +- clang/lib/AST/Interp/InterpState.cpp | 17 + clang/lib/AST/Interp/InterpState.h | 11 + clang/lib/AST/Interp/Opcodes.td | 24 +- clang/lib/AST/Interp/Pointer.h | 1 + clang/test/AST/Interp/new-delete.cpp | 490 +++++++++++++++++++++ clang/test/Rewriter/rewrite-modern-catch.m | 2 +- clang/test/SemaCXX/delete.cpp | 2 +- clang/test/SemaCXX/new-delete.cpp | 24 +- 19 files changed, 1213 insertions(+), 8 deletions(-) create mode 100644 clang/lib/AST/Interp/DynamicAllocator.cpp create mode 100644 clang/lib/AST/Interp/DynamicAllocator.h create mode 100644 clang/test/AST/Interp/new-delete.cpp diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index ceaad8d3c5a86..70aecb781c2ff 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -75,6 +75,7 @@ add_clang_library(clangAST Interp/InterpBuiltin.cpp Interp/Floating.cpp Interp/EvaluationResult.cpp + Interp/DynamicAllocator.cpp Interp/Interp.cpp Interp/InterpBlock.cpp Interp/InterpFrame.cpp diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp index 30dc7f5e4840b..28c4ffd071862 100644 --- a/clang/lib/AST/Interp/Compiler.cpp +++ b/clang/lib/AST/Interp/Compiler.cpp @@ -2771,6 +2771,117 @@ bool Compiler::VisitCXXInheritedCtorInitExpr( return this->emitCall(F, 0, E); } +template +bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { + assert(classifyPrim(E->getType()) == PT_Ptr); + const Expr *Init = E->getInitializer(); + QualType ElementType = E->getAllocatedType(); + std::optional ElemT = classify(ElementType); + unsigned PlacementArgs = E->getNumPlacementArgs(); + bool IsNoThrow = false; + + // FIXME: Better diagnostic. diag::note_constexpr_new_placement + if (PlacementArgs != 0) { + // The only new-placement list we support is of the form (std::nothrow). + // + // FIXME: There is no restriction on this, but it's not clear that any + // other form makes any sense. We get here for cases such as: + // + // new (std::align_val_t{N}) X(int) + // + // (which should presumably be valid only if N is a multiple of + // alignof(int), and in any case can't be deallocated unless N is + // alignof(X) and X has new-extended alignment). + if (PlacementArgs != 1 || !E->getPlacementArg(0)->getType()->isNothrowT()) + return this->emitInvalid(E); + + if (!this->discard(E->getPlacementArg(0))) + return false; + IsNoThrow = true; + } + + const Descriptor *Desc; + if (ElemT) { + if (E->isArray()) + Desc = nullptr; // We're not going to use it in this case. + else + Desc = P.createDescriptor(E, *ElemT, Descriptor::InlineDescMD, + /*IsConst=*/false, /*IsTemporary=*/false, + /*IsMutable=*/false); + } else { + Desc = P.createDescriptor( + E, ElementType.getTypePtr(), + E->isArray() ? 
std::nullopt : Descriptor::InlineDescMD, + /*IsConst=*/false, /*IsTemporary=*/false, /*IsMutable=*/false, Init); + } + + if (E->isArray()) { + std::optional ArraySizeExpr = E->getArraySize(); + if (!ArraySizeExpr) + return false; + + const Expr *Stripped = *ArraySizeExpr; + for (; auto *ICE = dyn_cast(Stripped); + Stripped = ICE->getSubExpr()) + if (ICE->getCastKind() != CK_NoOp && + ICE->getCastKind() != CK_IntegralCast) + break; + + PrimType SizeT = classifyPrim(Stripped->getType()); + + if (!this->visit(Stripped)) + return false; + + if (ElemT) { + // N primitive elements. + if (!this->emitAllocN(SizeT, *ElemT, E, IsNoThrow, E)) + return false; + } else { + // N Composite elements. + if (!this->emitAllocCN(SizeT, Desc, IsNoThrow, E)) + return false; + } + + if (Init && !this->visitInitializer(Init)) + return false; + + } else { + // Allocate just one element. + if (!this->emitAlloc(Desc, E)) + return false; + + if (Init) { + if (ElemT) { + if (!this->visit(Init)) + return false; + + if (!this->emitInit(*ElemT, E)) + return false; + } else { + // Composite. + if (!this->visitInitializer(Init)) + return false; + } + } + } + + if (DiscardResult) + return this->emitPopPtr(E); + + return true; +} + +template +bool Compiler::VisitCXXDeleteExpr(const CXXDeleteExpr *E) { + const Expr *Arg = E->getArgument(); + + // Arg must be an lvalue. + if (!this->visit(Arg)) + return false; + + return this->emitFree(E->isArrayForm(), E); +} + template bool Compiler::VisitExpressionTraitExpr(const ExpressionTraitExpr *E) { assert(Ctx.getLangOpts().CPlusPlus); diff --git a/clang/lib/AST/Interp/Compiler.h b/clang/lib/AST/Interp/Compiler.h index 23e7afd767e88..6df723df2b444 100644 --- a/clang/lib/AST/Interp/Compiler.h +++ b/clang/lib/AST/Interp/Compiler.h @@ -190,6 +190,8 @@ class Compiler : public ConstStmtVisitor, bool>, bool VisitObjCBoxedExpr(const ObjCBoxedExpr *E); bool VisitCXXStdInitializerListExpr(const CXXStdInitializerListExpr *E); bool VisitStmtExpr(const StmtExpr *E); + bool VisitCXXNewExpr(const CXXNewExpr *E); + bool VisitCXXDeleteExpr(const CXXDeleteExpr *E); // Statements. bool visitCompoundStmt(const CompoundStmt *S); diff --git a/clang/lib/AST/Interp/DynamicAllocator.cpp b/clang/lib/AST/Interp/DynamicAllocator.cpp new file mode 100644 index 0000000000000..a515997740780 --- /dev/null +++ b/clang/lib/AST/Interp/DynamicAllocator.cpp @@ -0,0 +1,118 @@ +//==-------- DynamicAllocator.cpp - Dynamic allocations ----------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DynamicAllocator.h" +#include "InterpBlock.h" +#include "InterpState.h" + +using namespace clang; +using namespace clang::interp; + +DynamicAllocator::~DynamicAllocator() { cleanup(); } + +void DynamicAllocator::cleanup() { + // Invoke destructors of all the blocks and as a last restort, + // reset all the pointers pointing to them to null pointees. + // This should never show up in diagnostics, but it's necessary + // for us to not cause use-after-free problems. 
+ for (auto &Iter : AllocationSites) { + auto &AllocSite = Iter.second; + for (auto &Alloc : AllocSite.Allocations) { + Block *B = reinterpret_cast(Alloc.Memory.get()); + B->invokeDtor(); + if (B->hasPointers()) { + while (B->Pointers) { + Pointer *Next = B->Pointers->Next; + B->Pointers->PointeeStorage.BS.Pointee = nullptr; + B->Pointers = Next; + } + B->Pointers = nullptr; + } + } + } + + AllocationSites.clear(); +} + +Block *DynamicAllocator::allocate(const Expr *Source, PrimType T, + size_t NumElements, unsigned EvalID) { + // Create a new descriptor for an array of the specified size and + // element type. + const Descriptor *D = allocateDescriptor( + Source, T, Descriptor::InlineDescMD, NumElements, /*IsConst=*/false, + /*IsTemporary=*/false, /*IsMutable=*/false); + + return allocate(D, EvalID); +} + +Block *DynamicAllocator::allocate(const Descriptor *ElementDesc, + size_t NumElements, unsigned EvalID) { + // Create a new descriptor for an array of the specified size and + // element type. + const Descriptor *D = allocateDescriptor( + ElementDesc->asExpr(), ElementDesc, Descriptor::InlineDescMD, NumElements, + /*IsConst=*/false, /*IsTemporary=*/false, /*IsMutable=*/false); + return allocate(D, EvalID); +} + +Block *DynamicAllocator::allocate(const Descriptor *D, unsigned EvalID) { + assert(D); + assert(D->asExpr()); + + auto Memory = + std::make_unique(sizeof(Block) + D->getAllocSize()); + auto *B = new (Memory.get()) Block(EvalID, D, /*isStatic=*/false); + B->invokeCtor(); + + InlineDescriptor *ID = reinterpret_cast(B->rawData()); + ID->Desc = D; + ID->IsActive = true; + ID->Offset = sizeof(InlineDescriptor); + ID->IsBase = false; + ID->IsFieldMutable = false; + ID->IsConst = false; + ID->IsInitialized = false; + + B->IsDynamic = true; + + if (auto It = AllocationSites.find(D->asExpr()); It != AllocationSites.end()) + It->second.Allocations.emplace_back(std::move(Memory)); + else + AllocationSites.insert( + {D->asExpr(), AllocationSite(std::move(Memory), D->isArray())}); + return B; +} + +bool DynamicAllocator::deallocate(const Expr *Source, + const Block *BlockToDelete, InterpState &S) { + auto It = AllocationSites.find(Source); + if (It == AllocationSites.end()) + return false; + + auto &Site = It->second; + assert(Site.size() > 0); + + // Find the Block to delete. + auto AllocIt = llvm::find_if(Site.Allocations, [&](const Allocation &A) { + const Block *B = reinterpret_cast(A.Memory.get()); + return BlockToDelete == B; + }); + + assert(AllocIt != Site.Allocations.end()); + + Block *B = reinterpret_cast(AllocIt->Memory.get()); + B->invokeDtor(); + + S.deallocate(B); + Site.Allocations.erase(AllocIt); + + if (Site.size() == 0) + AllocationSites.erase(It); + + return true; +} diff --git a/clang/lib/AST/Interp/DynamicAllocator.h b/clang/lib/AST/Interp/DynamicAllocator.h new file mode 100644 index 0000000000000..a84600aa54cc5 --- /dev/null +++ b/clang/lib/AST/Interp/DynamicAllocator.h @@ -0,0 +1,102 @@ +//==--------- DynamicAllocator.h - Dynamic allocations ------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INTERP_DYNAMIC_ALLOCATOR_H +#define LLVM_CLANG_AST_INTERP_DYNAMIC_ALLOCATOR_H + +#include "Descriptor.h" +#include "InterpBlock.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Allocator.h" + +namespace clang { +class Expr; +namespace interp { +class Block; +class InterpState; + +/// Manages dynamic memory allocations done during bytecode interpretation. +/// +/// We manage allocations as a map from their new-expression to a list +/// of allocations. This is called an AllocationSite. For each site, we +/// record whether it was allocated using new or new[], the +/// IsArrayAllocation flag. +/// +/// For all array allocations, we need to allocate new Descriptor instances, +/// so the DynamicAllocator has a llvm::BumpPtrAllocator similar to Program. +class DynamicAllocator final { + struct Allocation { + std::unique_ptr Memory; + Allocation(std::unique_ptr Memory) + : Memory(std::move(Memory)) {} + }; + + struct AllocationSite { + llvm::SmallVector Allocations; + bool IsArrayAllocation = false; + + AllocationSite(std::unique_ptr Memory, bool Array) + : IsArrayAllocation(Array) { + Allocations.push_back({std::move(Memory)}); + } + + size_t size() const { return Allocations.size(); } + }; + +public: + DynamicAllocator() = default; + ~DynamicAllocator(); + + void cleanup(); + + unsigned getNumAllocations() const { return AllocationSites.size(); } + + /// Allocate ONE element of the given descriptor. + Block *allocate(const Descriptor *D, unsigned EvalID); + /// Allocate \p NumElements primitive elements of the given type. + Block *allocate(const Expr *Source, PrimType T, size_t NumElements, + unsigned EvalID); + /// Allocate \p NumElements elements of the given descriptor. + Block *allocate(const Descriptor *D, size_t NumElements, unsigned EvalID); + + /// Deallocate the given source+block combination. + /// Returns \c true if anything has been deallocatd, \c false otherwise. + bool deallocate(const Expr *Source, const Block *BlockToDelete, + InterpState &S); + + /// Checks whether the allocation done at the given source is an array + /// allocation. + bool isArrayAllocation(const Expr *Source) const { + if (auto It = AllocationSites.find(Source); It != AllocationSites.end()) + return It->second.IsArrayAllocation; + return false; + } + + /// Allocation site iterator. + using const_virtual_iter = + llvm::DenseMap::const_iterator; + llvm::iterator_range allocation_sites() const { + return llvm::make_range(AllocationSites.begin(), AllocationSites.end()); + } + +private: + llvm::DenseMap AllocationSites; + + using PoolAllocTy = llvm::BumpPtrAllocatorImpl; + PoolAllocTy DescAllocator; + + /// Allocates a new descriptor. 
+ template Descriptor *allocateDescriptor(Ts &&...Args) { + return new (DescAllocator) Descriptor(std::forward(Args)...); + } +}; + +} // namespace interp +} // namespace clang +#endif diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index 74413baf6fc0c..59e78686b78ad 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -133,9 +133,17 @@ bool EvalEmitter::fallthrough(const LabelTy &Label) { return true; } +static bool checkReturnState(InterpState &S) { + return S.maybeDiagnoseDanglingAllocations(); +} + template bool EvalEmitter::emitRet(const SourceInfo &Info) { if (!isActive()) return true; + + if (!checkReturnState(S)) + return false; + using T = typename PrimConv::T; EvalResult.setValue(S.Stk.pop().toAPValue()); return true; @@ -147,9 +155,14 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { const Pointer &Ptr = S.Stk.pop(); + if (!EvalResult.checkReturnValue(S, Ctx, Ptr, Info)) + return false; if (CheckFullyInitialized && !EvalResult.checkFullyInitialized(S, Ptr)) return false; + if (!checkReturnState(S)) + return false; + // Implicitly convert lvalue to rvalue, if requested. if (ConvertResultToRValue) { if (!Ptr.isZero() && !Ptr.isDereferencable()) @@ -174,12 +187,17 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { if (!isActive()) return true; + + if (!checkReturnState(S)) + return false; // Function pointers cannot be converted to rvalues. EvalResult.setFunctionPointer(S.Stk.pop()); return true; } bool EvalEmitter::emitRetVoid(const SourceInfo &Info) { + if (!checkReturnState(S)) + return false; EvalResult.setValid(); return true; } @@ -187,9 +205,14 @@ bool EvalEmitter::emitRetVoid(const SourceInfo &Info) { bool EvalEmitter::emitRetValue(const SourceInfo &Info) { const auto &Ptr = S.Stk.pop(); + if (!EvalResult.checkReturnValue(S, Ctx, Ptr, Info)) + return false; if (CheckFullyInitialized && !EvalResult.checkFullyInitialized(S, Ptr)) return false; + if (!checkReturnState(S)) + return false; + if (std::optional APV = Ptr.toRValue(S.getCtx(), EvalResult.getSourceType())) { EvalResult.setValue(*APV); diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp index d0d68f75dd803..0bebfd4ad984e 100644 --- a/clang/lib/AST/Interp/EvaluationResult.cpp +++ b/clang/lib/AST/Interp/EvaluationResult.cpp @@ -10,6 +10,7 @@ #include "InterpState.h" #include "Record.h" #include "clang/AST/ExprCXX.h" +#include "llvm/ADT/SetVector.h" namespace clang { namespace interp { @@ -152,6 +153,11 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S, if (Ptr.isZero()) return true; + // We can't inspect dead pointers at all. Return true here so we can + // diagnose them later. 
+ if (!Ptr.isLive()) + return true; + SourceLocation InitLoc; if (const auto *D = Source.dyn_cast()) InitLoc = cast(D)->getAnyInitializer()->getExprLoc(); @@ -168,5 +174,71 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S, return true; } +static void collectBlocks(const Pointer &Ptr, + llvm::SetVector &Blocks) { + auto isUsefulPtr = [](const Pointer &P) -> bool { + return P.isLive() && !P.isZero() && !P.isDummy() && + !P.isUnknownSizeArray() && !P.isOnePastEnd() && P.isBlockPointer(); + }; + + if (!isUsefulPtr(Ptr)) + return; + + Blocks.insert(Ptr.block()); + + const Descriptor *Desc = Ptr.getFieldDesc(); + if (!Desc) + return; + + if (const Record *R = Desc->ElemRecord) { + for (const Record::Field &F : R->fields()) { + const Pointer &FieldPtr = Ptr.atField(F.Offset); + assert(FieldPtr.block() == Ptr.block()); + collectBlocks(FieldPtr, Blocks); + } + } else if (Desc->isPrimitive() && Desc->getPrimType() == PT_Ptr) { + const Pointer &Pointee = Ptr.deref(); + if (isUsefulPtr(Pointee) && !Blocks.contains(Pointee.block())) + collectBlocks(Pointee, Blocks); + + } else if (Desc->isPrimitiveArray() && Desc->getPrimType() == PT_Ptr) { + for (unsigned I = 0; I != Desc->getNumElems(); ++I) { + const Pointer &ElemPointee = Ptr.atIndex(I).deref(); + if (isUsefulPtr(ElemPointee) && !Blocks.contains(ElemPointee.block())) + collectBlocks(ElemPointee, Blocks); + } + } else if (Desc->isCompositeArray()) { + for (unsigned I = 0; I != Desc->getNumElems(); ++I) { + const Pointer &ElemPtr = Ptr.atIndex(I).narrow(); + collectBlocks(ElemPtr, Blocks); + } + } +} + +bool EvaluationResult::checkReturnValue(InterpState &S, const Context &Ctx, + const Pointer &Ptr, + const SourceInfo &Info) { + // Collect all blocks that this pointer (transitively) points to and + // return false if any of them is a dynamic block. + llvm::SetVector Blocks; + + collectBlocks(Ptr, Blocks); + + for (const Block *B : Blocks) { + if (B->isDynamic()) { + assert(B->getDescriptor()); + assert(B->getDescriptor()->asExpr()); + + S.FFDiag(Info, diag::note_constexpr_dynamic_alloc) + << Ptr.getType()->isReferenceType() << !Ptr.isRoot(); + S.Note(B->getDescriptor()->asExpr()->getExprLoc(), + diag::note_constexpr_dynamic_alloc_here); + return false; + } + } + + return true; +} + } // namespace interp } // namespace clang diff --git a/clang/lib/AST/Interp/EvaluationResult.h b/clang/lib/AST/Interp/EvaluationResult.h index 378f1ccdb0af4..ef662e3779bc3 100644 --- a/clang/lib/AST/Interp/EvaluationResult.h +++ b/clang/lib/AST/Interp/EvaluationResult.h @@ -98,7 +98,12 @@ class EvaluationResult final { /// LValue and we can't read from it. std::optional toRValue() const; + /// Check that all subobjects of the given pointer have been initialized. bool checkFullyInitialized(InterpState &S, const Pointer &Ptr) const; + /// Check that none of the blocks the given pointer (transitively) points + /// to are dynamically allocated. 
+ bool checkReturnValue(InterpState &S, const Context &Ctx, const Pointer &Ptr, + const SourceInfo &Info); QualType getSourceType() const { if (const auto *D = @@ -113,6 +118,7 @@ class EvaluationResult final { void dump() const; friend class EvalEmitter; + friend class InterpState; }; } // namespace interp diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index b673cc27aee21..fb63228f8aea8 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -717,6 +717,58 @@ bool CheckFloatResult(InterpState &S, CodePtr OpPC, const Floating &Result, return true; } +bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC) { + if (S.getLangOpts().CPlusPlus20) + return true; + + const SourceInfo &E = S.Current->getSource(OpPC); + S.FFDiag(E, diag::note_constexpr_new); + return false; +} + +bool CheckNewDeleteForms(InterpState &S, CodePtr OpPC, bool NewWasArray, + bool DeleteIsArray, const Descriptor *D, + const Expr *NewExpr) { + if (NewWasArray == DeleteIsArray) + return true; + + QualType TypeToDiagnose; + // We need to shuffle things around a bit here to get a better diagnostic, + // because the expression we allocated the block for was of type int*, + // but we want to get the array size right. + if (D->isArray()) { + QualType ElemQT = D->getType()->getPointeeType(); + TypeToDiagnose = S.getCtx().getConstantArrayType( + ElemQT, APInt(64, static_cast(D->getNumElems()), false), + nullptr, ArraySizeModifier::Normal, 0); + } else + TypeToDiagnose = D->getType()->getPointeeType(); + + const SourceInfo &E = S.Current->getSource(OpPC); + S.FFDiag(E, diag::note_constexpr_new_delete_mismatch) + << DeleteIsArray << 0 << TypeToDiagnose; + S.Note(NewExpr->getExprLoc(), diag::note_constexpr_dynamic_alloc_here) + << NewExpr->getSourceRange(); + return false; +} + +bool CheckDeleteSource(InterpState &S, CodePtr OpPC, const Expr *Source, + const Pointer &Ptr) { + if (Source && isa(Source)) + return true; + + // Whatever this is, we didn't heap allocate it. + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_delete_not_heap_alloc) + << Ptr.toDiagnosticString(S.getCtx()); + + if (Ptr.isTemporary()) + S.Note(Ptr.getDeclLoc(), diag::note_constexpr_temporary_here); + else + S.Note(Ptr.getDeclLoc(), diag::note_declared_at); + return false; +} + /// We aleady know the given DeclRefExpr is invalid for some reason, /// now figure out why and print appropriate diagnostics. bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) { diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 16093393b5da2..b4f8c03280c85 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -15,6 +15,7 @@ #include "../ExprConstShared.h" #include "Boolean.h" +#include "DynamicAllocator.h" #include "Floating.h" #include "Function.h" #include "FunctionPointer.h" @@ -122,6 +123,20 @@ bool CheckPure(InterpState &S, CodePtr OpPC, const CXXMethodDecl *MD); bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *CE, unsigned ArgSize); +/// Checks if dynamic memory allocation is available in the current +/// language mode. +bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC); + +/// Diagnose mismatched new[]/delete or new/delete[] pairs. 
+bool CheckNewDeleteForms(InterpState &S, CodePtr OpPC, bool NewWasArray, + bool DeleteIsArray, const Descriptor *D, + const Expr *NewExpr); + +/// Check the source of the pointer passed to delete/delete[] has actually +/// been heap allocated by us. +bool CheckDeleteSource(InterpState &S, CodePtr OpPC, const Expr *Source, + const Pointer &Ptr); + /// Sets the given integral value to the pointer, which is of /// a std::{weak,partial,strong}_ordering type. bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC, @@ -189,6 +204,30 @@ bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) { return true; } +template +bool CheckArraySize(InterpState &S, CodePtr OpPC, SizeT *NumElements, + unsigned ElemSize, bool IsNoThrow) { + // FIXME: Both the SizeT::from() as well as the + // NumElements.toAPSInt() in this function are rather expensive. + + // FIXME: GH63562 + // APValue stores array extents as unsigned, + // so anything that is greater that unsigned would overflow when + // constructing the array, we catch this here. + SizeT MaxElements = SizeT::from(Descriptor::MaxArrayElemBytes / ElemSize); + if (NumElements->toAPSInt().getActiveBits() > + ConstantArrayType::getMaxSizeBits(S.getCtx()) || + *NumElements > MaxElements) { + if (!IsNoThrow) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_new_too_large) + << NumElements->toDiagnosticString(S.getCtx()); + } + return false; + } + return true; +} + /// Checks if the result of a floating-point operation is valid /// in the current context. bool CheckFloatResult(InterpState &S, CodePtr OpPC, const Floating &Result, @@ -2766,6 +2805,119 @@ inline bool CheckDecl(InterpState &S, CodePtr OpPC, const VarDecl *VD) { return true; } +inline bool Alloc(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { + assert(Desc); + + if (!CheckDynamicMemoryAllocation(S, OpPC)) + return false; + + DynamicAllocator &Allocator = S.getAllocator(); + Block *B = Allocator.allocate(Desc, S.Ctx.getEvalID()); + assert(B); + + S.Stk.push(B, sizeof(InlineDescriptor)); + + return true; +} + +template ::T> +inline bool AllocN(InterpState &S, CodePtr OpPC, PrimType T, const Expr *Source, + bool IsNoThrow) { + if (!CheckDynamicMemoryAllocation(S, OpPC)) + return false; + + SizeT NumElements = S.Stk.pop(); + if (!CheckArraySize(S, OpPC, &NumElements, primSize(T), IsNoThrow)) { + if (!IsNoThrow) + return false; + + // If this failed and is nothrow, just return a null ptr. + S.Stk.push(0, nullptr); + return true; + } + + DynamicAllocator &Allocator = S.getAllocator(); + Block *B = Allocator.allocate(Source, T, static_cast(NumElements), + S.Ctx.getEvalID()); + assert(B); + S.Stk.push(B, sizeof(InlineDescriptor)); + + return true; +} + +template ::T> +inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc, + bool IsNoThrow) { + if (!CheckDynamicMemoryAllocation(S, OpPC)) + return false; + + SizeT NumElements = S.Stk.pop(); + if (!CheckArraySize(S, OpPC, &NumElements, ElementDesc->getSize(), + IsNoThrow)) { + if (!IsNoThrow) + return false; + + // If this failed and is nothrow, just return a null ptr. 
+ S.Stk.push(0, ElementDesc); + return true; + } + + DynamicAllocator &Allocator = S.getAllocator(); + Block *B = Allocator.allocate(ElementDesc, static_cast(NumElements), + S.Ctx.getEvalID()); + assert(B); + + S.Stk.push(B, sizeof(InlineDescriptor)); + + return true; +} + +static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) { + + if (!CheckDynamicMemoryAllocation(S, OpPC)) + return false; + + const Expr *Source = nullptr; + const Block *BlockToDelete = nullptr; + { + // Extra scope for this so the block doesn't have this pointer + // pointing to it when we destroy it. + const Pointer &Ptr = S.Stk.pop(); + + // Deleteing nullptr is always fine. + if (Ptr.isZero()) + return true; + + if (!Ptr.isRoot() || Ptr.isOnePastEnd() || Ptr.isArrayElement()) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_delete_subobject) + << Ptr.toDiagnosticString(S.getCtx()) << Ptr.isOnePastEnd(); + return false; + } + + Source = Ptr.getDeclDesc()->asExpr(); + BlockToDelete = Ptr.block(); + + if (!CheckDeleteSource(S, OpPC, Source, Ptr)) + return false; + } + assert(Source); + assert(BlockToDelete); + + DynamicAllocator &Allocator = S.getAllocator(); + bool WasArrayAlloc = Allocator.isArrayAllocation(Source); + const Descriptor *BlockDesc = BlockToDelete->getDescriptor(); + + if (!Allocator.deallocate(Source, BlockToDelete, S)) { + // Nothing has been deallocated, this must be a double-delete. + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_double_delete); + return false; + } + return CheckNewDeleteForms(S, OpPC, WasArrayAlloc, DeleteIsArrayForm, + BlockDesc, Source); +} + //===----------------------------------------------------------------------===// // Read opcode arguments //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index ee33e5a4b2df0..3760ded7b13fe 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -52,14 +52,14 @@ class Block final { Block(unsigned EvalID, const std::optional &DeclID, const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false) : EvalID(EvalID), DeclID(DeclID), IsStatic(IsStatic), IsExtern(IsExtern), - Desc(Desc) { + IsDynamic(false), Desc(Desc) { assert(Desc); } Block(unsigned EvalID, const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false) : EvalID(EvalID), DeclID((unsigned)-1), IsStatic(IsStatic), - IsExtern(IsExtern), Desc(Desc) { + IsExtern(IsExtern), IsDynamic(false), Desc(Desc) { assert(Desc); } @@ -73,6 +73,7 @@ class Block final { bool isStatic() const { return IsStatic; } /// Checks if the block is temporary. bool isTemporary() const { return Desc->IsTemporary; } + bool isDynamic() const { return IsDynamic; } /// Returns the size of the block. unsigned getSize() const { return Desc->getAllocSize(); } /// Returns the declaration ID. @@ -130,11 +131,12 @@ class Block final { friend class Pointer; friend class DeadBlock; friend class InterpState; + friend class DynamicAllocator; Block(unsigned EvalID, const Descriptor *Desc, bool IsExtern, bool IsStatic, bool IsDead) : EvalID(EvalID), IsStatic(IsStatic), IsExtern(IsExtern), IsDead(true), - Desc(Desc) { + IsDynamic(false), Desc(Desc) { assert(Desc); } @@ -164,6 +166,9 @@ class Block final { /// Flag indicating if the block contents have been initialized /// via invokeCtor. 
bool IsInitialized = false; + /// Flag indicating if this block has been allocated via dynamic + /// memory allocation (e.g. malloc). + bool IsDynamic = false; /// Pointer to the stack slot descriptor. const Descriptor *Desc; }; diff --git a/clang/lib/AST/Interp/InterpState.cpp b/clang/lib/AST/Interp/InterpState.cpp index 40eb28bfb4875..332f551838b72 100644 --- a/clang/lib/AST/Interp/InterpState.cpp +++ b/clang/lib/AST/Interp/InterpState.cpp @@ -41,6 +41,8 @@ void InterpState::cleanup() { P->PointeeStorage.BS.Pointee = nullptr; } } + + Alloc.cleanup(); } Frame *InterpState::getCurrentFrame() { @@ -81,3 +83,18 @@ void InterpState::deallocate(Block *B) { B->invokeDtor(); } } + +bool InterpState::maybeDiagnoseDanglingAllocations() { + bool NoAllocationsLeft = (Alloc.getNumAllocations() == 0); + + if (!checkingPotentialConstantExpression()) { + for (const auto &It : Alloc.allocation_sites()) { + assert(It.second.size() > 0); + + const Expr *Source = It.first; + CCEDiag(Source->getExprLoc(), diag::note_constexpr_memory_leak) + << (It.second.size() - 1) << Source->getSourceRange(); + } + } + return NoAllocationsLeft; +} diff --git a/clang/lib/AST/Interp/InterpState.h b/clang/lib/AST/Interp/InterpState.h index 138e1d7ac95d5..61ee54331c65d 100644 --- a/clang/lib/AST/Interp/InterpState.h +++ b/clang/lib/AST/Interp/InterpState.h @@ -14,6 +14,7 @@ #define LLVM_CLANG_AST_INTERP_INTERPSTATE_H #include "Context.h" +#include "DynamicAllocator.h" #include "Function.h" #include "InterpFrame.h" #include "InterpStack.h" @@ -102,13 +103,23 @@ class InterpState final : public State, public SourceMapper { void setEvalLocation(SourceLocation SL) { this->EvalLocation = SL; } + DynamicAllocator &getAllocator() { return Alloc; } + + /// Diagnose any dynamic allocations that haven't been freed yet. + /// Will return \c false if there were any allocations to diagnose, + /// \c true otherwise. + bool maybeDiagnoseDanglingAllocations(); + private: + friend class EvaluationResult; /// AST Walker state. State &Parent; /// Dead block chain. DeadBlock *DeadBlocks = nullptr; /// Reference to the offset-source mapping. SourceMapper *M; + /// Allocator used for dynamic allocations performed via the program. + DynamicAllocator Alloc; public: /// Reference to the module containing all bytecode. diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 8d01fe1ac2bd1..3e69098570bd7 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -58,12 +58,14 @@ def ArgRoundingMode : ArgType { let Name = "llvm::RoundingMode"; } def ArgLETD: ArgType { let Name = "const LifetimeExtendedTemporaryDecl *"; } def ArgCastKind : ArgType { let Name = "CastKind"; } def ArgCallExpr : ArgType { let Name = "const CallExpr *"; } +def ArgExpr : ArgType { let Name = "const Expr *"; } def ArgOffsetOfExpr : ArgType { let Name = "const OffsetOfExpr *"; } def ArgDeclRef : ArgType { let Name = "const DeclRefExpr *"; } -def ArgDesc : ArgType { let Name = "const Descriptor *"; } def ArgCCI : ArgType { let Name = "const ComparisonCategoryInfo *"; } def ArgDecl : ArgType { let Name = "const Decl*"; } def ArgVarDecl : ArgType { let Name = "const VarDecl*"; } +def ArgDesc : ArgType { let Name = "const Descriptor *"; } +def ArgPrimType : ArgType { let Name = "PrimType"; } //===----------------------------------------------------------------------===// // Classes of types instructions operate on. @@ -747,3 +749,23 @@ def GetMemberPtrDecl : Opcode; // Debugging. 
//===----------------------------------------------------------------------===// def Dump : Opcode; + +def Alloc : Opcode { + let Args = [ArgDesc]; +} + +def AllocN : Opcode { + let Types = [IntegerTypeClass]; + let Args = [ArgPrimType, ArgExpr, ArgBool]; + let HasGroup = 1; +} + +def AllocCN : Opcode { + let Types = [IntegerTypeClass]; + let Args = [ArgDesc, ArgBool]; + let HasGroup = 1; +} + +def Free : Opcode { + let Args = [ArgBool]; +} diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index 28bc42985adb2..972f55a553f6e 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -649,6 +649,7 @@ class Pointer { friend class MemberPointer; friend class InterpState; friend struct InitMap; + friend class DynamicAllocator; Pointer(Block *Pointee, unsigned Base, uint64_t Offset); diff --git a/clang/test/AST/Interp/new-delete.cpp b/clang/test/AST/Interp/new-delete.cpp new file mode 100644 index 0000000000000..04ce3ae5f6637 --- /dev/null +++ b/clang/test/AST/Interp/new-delete.cpp @@ -0,0 +1,490 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -triple=i686-linux-gnu -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s +// RUN: %clang_cc1 -std=c++20 -verify=ref,both %s +// RUN: %clang_cc1 -triple=i686-linux-gnu -std=c++20 -verify=ref,both %s + +#if __cplusplus >= 202002L + +constexpr int *Global = new int(12); // both-error {{must be initialized by a constant expression}} \ + // both-note {{pointer to heap-allocated object}} \ + // both-note {{heap allocation performed here}} + +static_assert(*(new int(12)) == 12); // both-error {{not an integral constant expression}} \ + // both-note {{allocation performed here was not deallocated}} + + +constexpr int a() { + new int(12); // both-note {{allocation performed here was not deallocated}} + return 1; +} +static_assert(a() == 1, ""); // both-error {{not an integral constant expression}} + +constexpr int b() { + int *i = new int(12); + int m = *i; + delete(i); + return m; +} +static_assert(b() == 12, ""); + + +struct S { + int a; + int b; + + static constexpr S *create(int a, int b) { + return new S(a, b); + } +}; + +constexpr int c() { + S *s = new S(12, 13); + + int i = s->a; + delete s; + + return i; +} +static_assert(c() == 12, ""); + +/// Dynamic allocation in function ::create(), freed in function d(). +constexpr int d() { + S* s = S::create(12, 14); + + int sum = s->a + s->b; + delete s; + return sum; +} +static_assert(d() == 26); + + +/// Test we emit the right diagnostic for several allocations done on +/// the same site. +constexpr int loop() { + for (int i = 0; i < 10; ++i) { + int *a = new int[10]; // both-note {{not deallocated (along with 9 other memory leaks)}} + } + + return 1; +} +static_assert(loop() == 1, ""); // both-error {{not an integral constant expression}} + +/// No initializer. +constexpr int noInit() { + int *i = new int; + delete i; + return 0; +} +static_assert(noInit() == 0, ""); + +/// Try to delete a pointer that hasn't been heap allocated. 
+constexpr int notHeapAllocated() { // both-error {{never produces a constant expression}} + int A = 0; // both-note 2{{declared here}} + delete &A; // ref-note 2{{delete of pointer '&A' that does not point to a heap-allocated object}} \ + // expected-note 2{{delete of pointer '&A' that does not point to a heap-allocated object}} + + return 1; +} +static_assert(notHeapAllocated() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'notHeapAllocated()'}} + +consteval int deleteNull() { + int *A = nullptr; + delete A; + return 1; +} +static_assert(deleteNull() == 1, ""); + +consteval int doubleDelete() { // both-error {{never produces a constant expression}} + int *A = new int; + delete A; + delete A; // both-note 2{{delete of pointer that has already been deleted}} + return 1; +} +static_assert(doubleDelete() == 1); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'doubleDelete()'}} + +constexpr int AutoArray() { + auto array = new int[]{0, 1, 2, 3}; + int ret = array[3]; + delete [] array; + return ret; +} + +static_assert(AutoArray() == 3); + +#if 0 +consteval int largeArray1(bool b) { + if (b) { + int *a = new int[1ull<<32]; // both-note {{cannot allocate array; evaluated array bound 4294967296 is too large}} + delete[] a; + } + return 1; +} +static_assert(largeArray1(false) == 1, ""); +static_assert(largeArray1(true) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'largeArray1(true)'}} + +consteval int largeArray2(bool b) { + if (b) { + S *a = new S[1ull<<32]; // both-note {{cannot allocate array; evaluated array bound 4294967296 is too large}} + delete[] a; + } + return 1; +} +static_assert(largeArray2(false) == 1, ""); +static_assert(largeArray2(true) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'largeArray2(true)'}} +#endif +namespace Arrays { + constexpr int d() { + int *Arr = new int[12]; + + Arr[0] = 1; + Arr[1] = 5; + + int sum = Arr[0] + Arr[1]; + delete[] Arr; + return sum; + } + static_assert(d() == 6); + + + constexpr int mismatch1() { // both-error {{never produces a constant expression}} + int *i = new int(12); // both-note {{allocated with 'new' here}} \ + // both-note 2{{heap allocation performed here}} + delete[] i; // both-warning {{'delete[]' applied to a pointer that was allocated with 'new'}} \ + // both-note 2{{array delete used to delete pointer to non-array object of type 'int'}} + return 6; + } + static_assert(mismatch1() == 6); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'mismatch1()'}} + + constexpr int mismatch2() { // both-error {{never produces a constant expression}} + int *i = new int[12]; // both-note {{allocated with 'new[]' here}} \ + // both-note 2{{heap allocation performed here}} + delete i; // both-warning {{'delete' applied to a pointer that was allocated with 'new[]'}} \ + // both-note 2{{non-array delete used to delete pointer to array object of type 'int[12]'}} + return 6; + } + static_assert(mismatch2() == 6); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'mismatch2()'}} + /// Array of composite elements. 
+ constexpr int foo() { + S *ss = new S[12]; + + ss[0].a = 12; + + int m = ss[0].a; + + delete[] ss; + return m; + } + static_assert(foo() == 12); + + + + constexpr int ArrayInit() { + auto array = new int[4]{0, 1, 2, 3}; + int ret = array[0]; + delete [] array; + return ret; + } + static_assert(ArrayInit() == 0, ""); + + struct S { + float F; + }; + constexpr float ArrayInit2() { + auto array = new S[4]{}; + float ret = array[0].F; + delete [] array; + return ret; + } + static_assert(ArrayInit2() == 0.0f, ""); +} + +namespace std { + struct type_info; + struct destroying_delete_t { + explicit destroying_delete_t() = default; + } inline constexpr destroying_delete{}; + struct nothrow_t { + explicit nothrow_t() = default; + } inline constexpr nothrow{}; + using size_t = decltype(sizeof(0)); + enum class align_val_t : size_t {}; +}; + +[[nodiscard]] void *operator new(std::size_t, const std::nothrow_t&) noexcept; +[[nodiscard]] void *operator new(std::size_t, std::align_val_t, const std::nothrow_t&) noexcept; +[[nodiscard]] void *operator new[](std::size_t, const std::nothrow_t&) noexcept; +[[nodiscard]] void *operator new[](std::size_t, std::align_val_t, const std::nothrow_t&) noexcept; +[[nodiscard]] void *operator new[](std::size_t, std::align_val_t); +void operator delete(void*, const std::nothrow_t&) noexcept; +void operator delete(void*, std::align_val_t, const std::nothrow_t&) noexcept; +void operator delete[](void*, const std::nothrow_t&) noexcept; +void operator delete[](void*, std::align_val_t, const std::nothrow_t&) noexcept; + +struct placement_new_arg {}; +void *operator new(std::size_t, placement_new_arg); +void operator delete(void*, placement_new_arg); + + +constexpr void *operator new(std::size_t, void *p) { return p; } +namespace std { + template constexpr T *construct(T *p) { return new (p) T; } + template constexpr void destroy(T *p) { p->~T(); } +} + + + +/// FIXME: The new interpreter produces the wrong diagnostic. +namespace PlacementNew { + constexpr int foo() { // both-error {{never produces a constant expression}} + char c[sizeof(int)]; + new (c) int{12}; // ref-note {{call to placement 'operator new'}} \ + // expected-note {{subexpression not valid in a constant expression}} + return 0; + } +} + +namespace NowThrowNew { + constexpr bool erroneous_array_bound_nothrow(long long n) { + int *p = new (std::nothrow) int[n]; + bool result = p != nullptr; + delete[] p; + return result; + } + static_assert(erroneous_array_bound_nothrow(3)); + static_assert(erroneous_array_bound_nothrow(0)); + static_assert(erroneous_array_bound_nothrow(-1) == 0); + static_assert(!erroneous_array_bound_nothrow(1LL << 62)); + + struct S { int a; }; + constexpr bool erroneous_array_bound_nothrow2(long long n) { + S *p = new (std::nothrow) S[n]; + bool result = p != nullptr; + delete[] p; + return result; + } + /// This needs support for CXXConstrucExprs with non-constant array sizes. 
+ static_assert(erroneous_array_bound_nothrow2(3)); // expected-error {{not an integral constant expression}} + static_assert(erroneous_array_bound_nothrow2(0));// expected-error {{not an integral constant expression}} + static_assert(erroneous_array_bound_nothrow2(-1) == 0);// expected-error {{not an integral constant expression}} + static_assert(!erroneous_array_bound_nothrow2(1LL << 62));// expected-error {{not an integral constant expression}} + + constexpr bool evaluate_nothrow_arg() { + bool ok = false; + delete new ((ok = true, std::nothrow)) int; + return ok; + } + static_assert(evaluate_nothrow_arg()); +} + +namespace placement_new_delete { + struct ClassSpecificNew { + void *operator new(std::size_t); + }; + struct ClassSpecificDelete { + void operator delete(void*); + }; + struct DestroyingDelete { + void operator delete(DestroyingDelete*, std::destroying_delete_t); + }; + struct alignas(64) Overaligned {}; + + constexpr bool ok() { + delete new Overaligned; + delete ::new ClassSpecificNew; + ::delete new ClassSpecificDelete; + ::delete new DestroyingDelete; + return true; + } + static_assert(ok()); + + /// FIXME: Diagnosting placement new. + constexpr bool bad(int which) { + switch (which) { + case 0: + delete new (placement_new_arg{}) int; // ref-note {{call to placement 'operator new'}} \ + // expected-note {{subexpression not valid in a constant expression}} + break; + + case 1: + delete new ClassSpecificNew; // ref-note {{call to class-specific 'operator new'}} + break; + + case 2: + delete new ClassSpecificDelete; // ref-note {{call to class-specific 'operator delete'}} + break; + + case 3: + delete new DestroyingDelete; // ref-note {{call to class-specific 'operator delete'}} + break; + + case 4: + // FIXME: This technically follows the standard's rules, but it seems + // unreasonable to expect implementations to support this. 
+ delete new (std::align_val_t{64}) Overaligned; // ref-note {{placement new expression is not yet supported}} \ + // expected-note {{subexpression not valid in a constant expression}} + break; + } + + return true; + } + static_assert(bad(0)); // both-error {{constant expression}} \ + // both-note {{in call}} + static_assert(bad(1)); // ref-error {{constant expression}} ref-note {{in call}} + static_assert(bad(2)); // ref-error {{constant expression}} ref-note {{in call}} + static_assert(bad(3)); // ref-error {{constant expression}} ref-note {{in call}} + static_assert(bad(4)); // both-error {{constant expression}} \ + // both-note {{in call}} +} + + + + +namespace delete_random_things { + static_assert((delete new int, true)); + static_assert((delete (int*)0, true)); + int n; // both-note {{declared here}} + static_assert((delete &n, true)); // both-error {{}} \ + // both-note {{delete of pointer '&n' that does not point to a heap-allocated object}} + struct A { int n; }; + static_assert((delete &(new A)->n, true)); // both-error {{}} \ + // both-note {{delete of pointer to subobject }} + static_assert((delete (new int + 1), true)); // both-error {{}} \ + // ref-note {{delete of pointer '&{*new int#0} + 1' that does not point to complete object}} \ + // expected-note {{delete of pointer '&new int + 1' that does not point to complete object}} + static_assert((delete[] (new int[3] + 1), true)); // both-error {{}} \ + // both-note {{delete of pointer to subobject}} + static_assert((delete &(int&)(int&&)0, true)); // both-error {{}} \ + // both-note {{delete of pointer '&0' that does not point to a heap-allocated object}} \ + // both-note {{temporary created here}} +} + +namespace value_dependent_delete { + template void f(T *p) { + int arr[(delete p, 0)]; + } +} + +namespace memory_leaks { + static_assert(*new bool(true)); // both-error {{}} both-note {{allocation performed here was not deallocated}} + + constexpr bool *f() { return new bool(true); } // both-note {{allocation performed here was not deallocated}} + static_assert(*f()); // both-error {{}} + + struct UP { + bool *p; + constexpr ~UP() { delete p; } + constexpr bool &operator*() { return *p; } + }; + constexpr UP g() { return {new bool(true)}; } + static_assert(*g()); // ok + + constexpr bool h(UP p) { return *p; } + static_assert(h({new bool(true)})); // ok +} + +/// From test/SemaCXX/cxx2a-consteval.cpp + +namespace std { +template struct remove_reference { using type = T; }; +template struct remove_reference { using type = T; }; +template struct remove_reference { using type = T; }; +template +constexpr typename std::remove_reference::type&& move(T &&t) noexcept { + return static_cast::type &&>(t); +} +} + +namespace cxx2a { +struct A { + int* p = new int(42); // both-note 7{{heap allocation performed here}} + consteval int ret_i() const { return p ? 
*p : 0; } + consteval A ret_a() const { return A{}; } + constexpr ~A() { delete p; } +}; + +consteval int by_value_a(A a) { return a.ret_i(); } + +consteval int const_a_ref(const A &a) { + return a.ret_i(); +} + +consteval int rvalue_ref(const A &&a) { + return a.ret_i(); +} + +consteval const A &to_lvalue_ref(const A &&a) { + return a; +} + +void test() { + constexpr A a{ nullptr }; + { int k = A().ret_i(); } + + { A k = A().ret_a(); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{heap-allocated object is not a constant expression}} + { A k = to_lvalue_ref(A()); } // both-error {{'cxx2a::to_lvalue_ref' is not a constant expression}} \ + // both-note {{reference to temporary is not a constant expression}} \ + // both-note {{temporary created here}} + { A k = to_lvalue_ref(A().ret_a()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{heap-allocated object is not a constant expression}} \ + // both-error {{'cxx2a::to_lvalue_ref' is not a constant expression}} \ + // both-note {{reference to temporary is not a constant expression}} \ + // both-note {{temporary created here}} + { int k = A().ret_a().ret_i(); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{heap-allocated object is not a constant expression}} + { int k = by_value_a(A()); } + { int k = const_a_ref(A()); } + { int k = const_a_ref(a); } + { int k = rvalue_ref(A()); } + { int k = rvalue_ref(std::move(a)); } + { int k = const_a_ref(A().ret_a()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{is not a constant expression}} + { int k = const_a_ref(to_lvalue_ref(A().ret_a())); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{is not a constant expression}} + { int k = const_a_ref(to_lvalue_ref(std::move(a))); } + { int k = by_value_a(A().ret_a()); } + { int k = by_value_a(to_lvalue_ref(static_cast(a))); } + { int k = (A().ret_a(), A().ret_i()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{is not a constant expression}} \ + // both-warning {{left operand of comma operator has no effect}} + { int k = (const_a_ref(A().ret_a()), A().ret_i()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ + // both-note {{is not a constant expression}} \ + // both-warning {{left operand of comma operator has no effect}} +} +} + +constexpr int *const &p = new int; // both-error {{must be initialized by a constant expression}} \ + // both-note {{pointer to heap-allocated object}} \ + // both-note {{allocation performed here}} + +constexpr const int *A[] = {nullptr, nullptr, new int{12}}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{pointer to heap-allocated object}} \ + // both-note {{allocation performed here}} + +struct Sp { + const int *p; +}; +constexpr Sp ss[] = {Sp{new int{154}}}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{pointer to heap-allocated object}} \ + // both-note {{allocation performed here}} + + + + +#else +/// Make sure we reject this prior to C++20 +constexpr int a() { // both-error {{never produces a constant expression}} + delete new int(12); // both-note 2{{dynamic memory allocation is not permitted in constant expressions until C++20}} + return 1; +} +static_assert(a() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'a()'}} +#endif diff --git 
a/clang/test/Rewriter/rewrite-modern-catch.m b/clang/test/Rewriter/rewrite-modern-catch.m index 1900301e91129..621c7ec45bae8 100644 --- a/clang/test/Rewriter/rewrite-modern-catch.m +++ b/clang/test/Rewriter/rewrite-modern-catch.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -x objective-c -Wno-return-type -fblocks -fms-extensions -rewrite-objc %s -o %t-rw.cpp +// RUN: %clang_cc1 -x objective-c -Wno-return-type -fblocks -fms-extensions -rewrite-objc %s -o %t-rw.cpp -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wno-address-of-temporary -D"id=void*" -D"SEL=void*" -D"__declspec(X)=" %t-rw.cpp void foo(id arg); diff --git a/clang/test/SemaCXX/delete.cpp b/clang/test/SemaCXX/delete.cpp index 08cc1766e9f7e..7d1f51cb218ce 100644 --- a/clang/test/SemaCXX/delete.cpp +++ b/clang/test/SemaCXX/delete.cpp @@ -1,5 +1,5 @@ // Test without PCH -// RUN: %clang_cc1 -fsyntax-only -include %S/delete-mismatch.h -fdiagnostics-parseable-fixits -std=c++11 %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -fsyntax-only -include %S/delete-mismatch.h -fdiagnostics-parseable-fixits -std=c++11 %s 2>&1 -fexperimental-new-constant-interpreter | FileCheck %s // Test with PCH // RUN: %clang_cc1 -x c++-header -std=c++11 -emit-pch -o %t %S/delete-mismatch.h diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index ec6ad43476f94..595bdc689d694 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -6,6 +6,14 @@ // RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++23 // RUN: %clang_cc1 -fsyntax-only -verify=expected,since-cxx26,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++2c +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++98 -fexperimental-new-constant-interpreter -DNEW_INTERP +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++11 -fexperimental-new-constant-interpreter -DNEW_INTERP +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++14 -fexperimental-new-constant-interpreter -DNEW_INTERP +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++17 -fexperimental-new-constant-interpreter -DNEW_INTERP +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++20 -fexperimental-new-constant-interpreter -DNEW_INTERP +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++23 -fexperimental-new-constant-interpreter -DNEW_INTERP +// RUN: %clang_cc1 -fsyntax-only -verify=expected,since-cxx26,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++2c -fexperimental-new-constant-interpreter -DNEW_INTERP + // FIXME Location is (frontend) // cxx17-note@*:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} @@ -653,10 +661,22 @@ int *fail = dependent_array_size("hello"); // expected-note {{instantiation of}} // FIXME: Our behavior here is incredibly inconsistent. GCC allows // constant-folding in array bounds in new-expressions. 
int (*const_fold)[12] = new int[3][&const_fold + 12 - &const_fold]; -#if __cplusplus >= 201402L +#if __cplusplus >= 201402L && !defined(NEW_INTERP) // expected-error@-2 {{array size is not a constant expression}} // expected-note@-3 {{cannot refer to element 12 of non-array}} -#elif __cplusplus < 201103L +#elif __cplusplus < 201103L && !defined(NEW_INTERP) // expected-error@-5 {{cannot allocate object of variably modified type}} // expected-warning@-6 {{variable length arrays in C++ are a Clang extension}} #endif +#ifdef NEW_INTERP +#if __cplusplus >= 201402L +// expected-error@-10 {{array size is not a constant expression}} +// expected-note@-11 {{cannot refer to element 12 of non-array}} +#elif __cplusplus >= 201103L +// expected-error@-13 {{only the first dimension of an allocated array may have dynamic size}} +// expected-note@-14 {{cannot refer to element 12 of non-array}} +#else +// expected-error@-16 {{only the first dimension of an allocated array may have dynamic size}} +// expected-note@-17 {{cannot refer to element 12 of non-array}} +#endif +#endif From 20c6b9fbba583d172e931dd24417784186136531 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 17 Jul 2024 10:46:12 +0100 Subject: [PATCH 249/777] [flang][debug] Fix issues with local variables. (#98661) This PR fixes two similar issues. 1. As reported in #97476, executables generated by flang show inconsistent behavior regarding the values of local array variables. 2. Variables with the save attribute would not show up in the debugger. The reason is the same in both cases. If a local variable has storage that extends beyond the function's lifetime, the way to represent it in the debug info is through a global variable whose scope is limited to the function. This is what is used for static local variables in C. Previously, local arrays worked when they were on the stack, but they would not show up if they had global storage. To fix this, if we can get a corresponding `GlobalOp` for a variable while processing `DeclareOp`, we treat the variable as a global with its scope set appropriately. A new FIR test is added. A previous integration test has been adjusted so as not to expect local variables for local arrays. With this fix in place, all the issues described in #97476 go away. It also fixes a lot of failures in GDB's Fortran testsuite. Fixes #97476.
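As a rough, hypothetical illustration of the modelling described above (not part of this patch), the C++ equivalent is a static local: the object has global storage, but its debug-info entry is scoped to the enclosing subprogram, which is how a Fortran local with the save attribute (or a local array given static storage) is now described.

// Minimal C++ analogue, assuming nothing beyond standard C++:
// `counter` lives in static storage (like a Fortran SAVE variable), so the
// compiler emits it as a global in the object file while its debug-info
// scope stays limited to bump(); a debugger shows it only inside bump().
#include <cstdio>

int bump() {
  static int counter = 0; // storage outlives the call, scope does not
  return ++counter;
}

int main() {
  for (int i = 0; i < 3; ++i)
    std::printf("%d\n", bump());
  return 0;
}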
--- .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 58 ++++++++++++++----- .../Integration/debug-fixed-array-type-2.f90 | 3 - .../debug-local-global-storage-1.fir | 52 +++++++++++++++++ 3 files changed, 94 insertions(+), 19 deletions(-) create mode 100644 flang/test/Transforms/debug-local-global-storage-1.fir diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 10c71d3fc9551..8bb24fb6c8078 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -49,7 +49,8 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase { void handleDeclareOp(fir::cg::XDeclareOp declOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scopeAttr, - fir::DebugTypeGenerator &typeGen); + fir::DebugTypeGenerator &typeGen, + mlir::SymbolTable *symbolTable); public: AddDebugInfoPass(fir::AddDebugInfoOptions options) : Base(options) {} @@ -63,7 +64,8 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase { mlir::LLVM::DIScopeAttr scope, unsigned line, bool decl); void handleGlobalOp(fir::GlobalOp glocalOp, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope); + mlir::LLVM::DIScopeAttr scope, + mlir::SymbolTable *symbolTable); }; static uint32_t getLineFromLoc(mlir::Location loc) { @@ -73,12 +75,19 @@ static uint32_t getLineFromLoc(mlir::Location loc) { return line; } +bool debugInfoIsAlreadySet(mlir::Location loc) { + if (mlir::isa(loc)) + return true; + return false; +} + } // namespace void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scopeAttr, - fir::DebugTypeGenerator &typeGen) { + fir::DebugTypeGenerator &typeGen, + mlir::SymbolTable *symbolTable) { mlir::MLIRContext *context = &getContext(); mlir::OpBuilder builder(context); auto result = fir::NameUniquer::deconstruct(declOp.getUniqName()); @@ -86,6 +95,12 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, if (result.first != fir::NameUniquer::NameKind::VARIABLE) return; + // If this DeclareOp actually represents a global then treat it as such. + if (auto global = symbolTable->lookup(declOp.getUniqName())) { + handleGlobalOp(global, fileAttr, scopeAttr, symbolTable); + return; + } + // Only accept local variables. if (result.second.procs.empty()) return; @@ -138,7 +153,10 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr( void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope) { + mlir::LLVM::DIScopeAttr scope, + mlir::SymbolTable *symbolTable) { + if (debugInfoIsAlreadySet(globalOp.getLoc())) + return; mlir::ModuleOp module = getOperation(); mlir::MLIRContext *context = &getContext(); fir::DebugTypeGenerator typeGen(module); @@ -163,12 +181,19 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, // declared. We are using a best guess of line - 1 where line is the source // line of the first member of the module that we encounter. - if (result.second.modules.empty()) - return; + if (result.second.procs.empty()) { + // Only look for module if this variable is not part of a function. 
+ if (result.second.modules.empty()) + return; - scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope, - line - 1, !globalOp.isInitialized()); + // Modules are generated at compile unit scope + if (mlir::LLVM::DISubprogramAttr sp = + mlir::dyn_cast_if_present(scope)) + scope = sp.getCompileUnit(); + scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope, + line - 1, !globalOp.isInitialized()); + } mlir::LLVM::DITypeAttr diType = typeGen.convertType( globalOp.getType(), fileAttr, scope, globalOp.getLoc()); auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get( @@ -182,6 +207,7 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, void AddDebugInfoPass::runOnOperation() { mlir::ModuleOp module = getOperation(); mlir::MLIRContext *context = &getContext(); + mlir::SymbolTable symbolTable(module); mlir::OpBuilder builder(context); llvm::StringRef fileName; std::string filePath; @@ -218,17 +244,11 @@ void AddDebugInfoPass::runOnOperation() { llvm::dwarf::getLanguage("DW_LANG_Fortran95"), fileAttr, producer, isOptimized, debugLevel); - if (debugLevel == mlir::LLVM::DIEmissionKind::Full) { - // Process 'GlobalOp' only if full debug info is requested. - for (auto globalOp : module.getOps()) - handleGlobalOp(globalOp, fileAttr, cuAttr); - } - module.walk([&](mlir::func::FuncOp funcOp) { mlir::Location l = funcOp->getLoc(); // If fused location has already been created then nothing to do // Otherwise, create a fused location. - if (mlir::dyn_cast(l)) + if (debugInfoIsAlreadySet(l)) return; unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry()) @@ -293,9 +313,15 @@ void AddDebugInfoPass::runOnOperation() { return; funcOp.walk([&](fir::cg::XDeclareOp declOp) { - handleDeclareOp(declOp, fileAttr, spAttr, typeGen); + handleDeclareOp(declOp, fileAttr, spAttr, typeGen, &symbolTable); }); }); + // Process any global which was not processed through DeclareOp. + if (debugLevel == mlir::LLVM::DIEmissionKind::Full) { + // Process 'GlobalOp' only if full debug info is requested. + for (auto globalOp : module.getOps()) + handleGlobalOp(globalOp, fileAttr, cuAttr, &symbolTable); + } } std::unique_ptr diff --git a/flang/test/Integration/debug-fixed-array-type-2.f90 b/flang/test/Integration/debug-fixed-array-type-2.f90 index 315525442a5bc..b34413458ad8d 100644 --- a/flang/test/Integration/debug-fixed-array-type-2.f90 +++ b/flang/test/Integration/debug-fixed-array-type-2.f90 @@ -23,20 +23,17 @@ function fn1(a1, b1, c1) result (res) ! CHECK-DAG: ![[R1:.*]] = !DISubrange(count: 3, lowerBound: 1) ! CHECK-DAG: ![[SUB1:.*]] = !{![[R1]]} ! CHECK-DAG: ![[D1TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB1]]) -! CHECK-DAG: !DILocalVariable(name: "d1"{{.*}}type: ![[D1TY]]) ! CHECK-DAG: ![[R21:.*]] = !DISubrange(count: 2, lowerBound: 1) ! CHECK-DAG: ![[R22:.*]] = !DISubrange(count: 5, lowerBound: 1) ! CHECK-DAG: ![[SUB2:.*]] = !{![[R21]], ![[R22]]} ! CHECK-DAG: ![[D2TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB2]]) -! CHECK-DAG: !DILocalVariable(name: "d2"{{.*}}type: ![[D2TY]]) ! CHECK-DAG: ![[R31:.*]] = !DISubrange(count: 6, lowerBound: 1) ! CHECK-DAG: ![[R32:.*]] = !DISubrange(count: 8, lowerBound: 1) ! CHECK-DAG: ![[R33:.*]] = !DISubrange(count: 7, lowerBound: 1) ! CHECK-DAG: ![[SUB3:.*]] = !{![[R31]], ![[R32]], ![[R33]]} ! CHECK-DAG: ![[D3TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[REAL]], elements: ![[SUB3]]) -! 
CHECK-DAG: !DILocalVariable(name: "d3"{{.*}}type: ![[D3TY]]) ! CHECK-DAG: !DILocalVariable(name: "a1", arg: 1{{.*}}type: ![[D1TY]]) ! CHECK-DAG: !DILocalVariable(name: "b1", arg: 2{{.*}}type: ![[D2TY]]) diff --git a/flang/test/Transforms/debug-local-global-storage-1.fir b/flang/test/Transforms/debug-local-global-storage-1.fir new file mode 100644 index 0000000000000..d9d8083a14709 --- /dev/null +++ b/flang/test/Transforms/debug-local-global-storage-1.fir @@ -0,0 +1,52 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"} { + func.func @_QMexamplePmod_sub() { + %c2 = arith.constant 2 : index + %1 = fir.address_of(@_QMexampleEmod_arr) : !fir.ref> + %2 = fircg.ext_declare %1(%c2, %c2) {uniq_name = "_QMexampleEmod_arr"} : (!fir.ref>, index, index) -> !fir.ref> loc(#loc4) + %3 = fir.address_of(@_QMexampleFmod_subEss) : !fir.ref + %4 = fircg.ext_declare %3 {uniq_name = "_QMexampleFmod_subEss"} : (!fir.ref) -> !fir.ref loc(#loc5) + return + } loc(#loc6) + func.func @_QQmain() attributes {fir.bindc_name = "test"} { + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %1 = fir.address_of(@_QFEarr) : !fir.ref> + %2 = fircg.ext_declare %1(%c3, %c4) {uniq_name = "_QFEarr"} : (!fir.ref>, index, index) -> !fir.ref> loc(#loc2) + %3 = fir.address_of(@_QFEs) : !fir.ref + %4 = fircg.ext_declare %3 {uniq_name = "_QFEs"} : (!fir.ref) -> !fir.ref loc(#loc3) + return + } loc(#loc1) + fir.global @_QMexampleEmod_arr : !fir.array<2x2xi32> { + %0 = fir.zero_bits !fir.array<2x2xi32> + fir.has_value %0 : !fir.array<2x2xi32> + } loc(#loc4) + fir.global internal @_QMexampleFmod_subEss : i32 { + %c2_i32 = arith.constant 2 : i32 + fir.has_value %c2_i32 : i32 + } loc(#loc5) + fir.global internal @_QFEarr : !fir.array<3x4xi32> { + %0 = fir.zero_bits !fir.array<3x4xi32> + fir.has_value %0 : !fir.array<3x4xi32> + } loc(#loc2) + fir.global internal @_QFEs : i32 { + %c2_i32 = arith.constant 2 : i32 + fir.has_value %c2_i32 : i32 + } loc(#loc3) +} +#loc1 = loc("test.f90":21:1) +#loc2 = loc("test.f90":22:1) +#loc3 = loc("test.f90":23:1) +#loc4 = loc("test.f90":5:1) +#loc5 = loc("test.f90":12:1) +#loc6 = loc("test.f90":10:1) + +// CHECK-DAG: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> +// CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}scope = #[[CU]]{{.*}}name = "example"{{.*}}> +// CHECK-DAG: #[[SP:.*]] = #llvm.di_subprogram<{{.*}}name = "_QQmain"{{.*}}> +// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}> +// CHECK-DAG: #llvm.di_global_variable +// CHECK-DAG: #llvm.di_global_variable +// CHECK-DAG: #llvm.di_global_variable +// CHECK-DAG: #llvm.di_global_variable From c5329c827ab345c4390f9a176573816e7f5c19e3 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Wed, 17 Jul 2024 
10:46:28 +0100 Subject: [PATCH 250/777] [LV][AArch64] Prefer Fixed over Scalable if cost-model is equal (Neoverse V2) (#95819) For the Neoverse V2 we would like to prefer fixed width over scalable vectorisation if the cost-model assigns an equal cost to both for certain loops. This improves 7 kernels from TSVC-2 and several production kernels by about 2x, and does not affect SPEC2017 INT and FP. This also adds a new TTI hook that can steer the loop vectorizer toward preferring fixed width vectorization, which can be set per CPU. For now, this is only enabled for the Neoverse V2. There are 3 reasons why preferring NEON might be better in the case where the cost-model is a tie and the SVE vector size is the same as NEON (128-bit): architectural reasons, micro-architecture reasons, and SVE codegen reasons. The latter will be improved over time, so the more important reasons are the former two. That is, the (micro-)architectural reason is the use of LDP/STP instructions, which are not available in SVE2, and the avoidance of predication. For what it is worth: this codegen strategy of generating more NEON is in line with GCC's codegen strategy, which is actually even more aggressive in generating NEON when no predication is required. We could be smarter about the decision making, but this seems to be a good first step in the right direction, and we can always revise this later (for example, make the target hook more general). --- .../llvm/Analysis/TargetTransformInfo.h | 9 +++ .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++ llvm/lib/Target/AArch64/AArch64Features.td | 4 ++ llvm/lib/Target/AArch64/AArch64Processors.td | 1 + .../AArch64/AArch64TargetTransformInfo.h | 4 ++ .../Transforms/Vectorize/LoopVectorize.cpp | 4 +- .../prefer-fixed-if-equal-to-scalable.ll | 60 +++++++++++++++++++ 8 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index bda9d4e624505..cf378008e4c7c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1679,6 +1679,11 @@ class TargetTransformInfo { false; ///< If op is an fp min/max, whether NaNs may be present. }; + /// \returns True if the targets prefers fixed width vectorization if the + /// loop vectorizer's cost-model assigns an equal cost to the fixed and + /// scalable version of the vectorized loop. + bool preferFixedOverScalableIfEqualCost() const; + /// \returns True if the target prefers reductions in loop.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; @@ -2156,6 +2161,7 @@ class TargetTransformInfo::Concept { virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual bool preferFixedOverScalableIfEqualCost() const = 0; virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, @@ -2891,6 +2897,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + bool preferFixedOverScalableIfEqualCost() const override { + return Impl.preferFixedOverScalableIfEqualCost(); + } bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index c1eb6151440be..47fde08735c0c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -920,6 +920,8 @@ class TargetTransformInfoImplBase { return VF; } + bool preferFixedOverScalableIfEqualCost() const { return false; } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index be4069bb3eabf..693f7a5bb7af5 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1286,6 +1286,10 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF, return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const { + return TTIImpl->preferFixedOverScalableIfEqualCost(); +} + bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index e523957afc25a..832e44fe117e2 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -355,6 +355,10 @@ def FeatureTHE : ExtensionWithMArch<"the", "THE", "FEAT_THE", // Armv9.0 Architecture Extensions //===----------------------------------------------------------------------===// +def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", + "UseFixedOverScalableIfEqualCost", "true", + "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">; + def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 87927093a2c4c..71384a23c49af 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -525,6 +525,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureUseFixedOverScalableIfEqualCost, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3 : 
SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 3eb9aa963c018..a9189fd53f40b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -371,6 +371,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { return TailFoldingStyle::DataWithoutLaneMask; } + bool preferFixedOverScalableIfEqualCost() const { + return ST->useFixedOverScalableIfEqualCost(); + } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI); bool supportsScalableVectors() const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3bdb545946e2b..71ec7252bd950 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4630,7 +4630,9 @@ bool LoopVectorizationPlanner::isMoreProfitable( // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable(); + bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && + A.Width.isScalable() && !B.Width.isScalable(); + auto CmpFn = [PreferScalable](const InstructionCost &LHS, const InstructionCost &RHS) { return PreferScalable ? LHS <= RHS : LHS < RHS; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll new file mode 100644 index 0000000000000..41595cc7d8996 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll @@ -0,0 +1,60 @@ +; RUN: opt -S < %s -passes=loop-vectorize -force-target-instruction-cost=1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-gnu" + +@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64 +@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64 + +define void @NeoverseV2() #0 { +; CHECK-LABEL: define void @NeoverseV2( +; CHECK: store <4 x float> +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 + %add = fadd fast float %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 16000 + %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2 + store float %add, ptr %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 16000 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define void @GenericCPU() #1 { +; CHECK-LABEL: define void @GenericCPU( +; CHECK: store +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv + %1 = 
load float, ptr %arrayidx2, align 4 + %add = fadd fast float %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 16000 + %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2 + store float %add, ptr %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 16000 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" } +attributes #1 = { vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" } From 9b9194af408003e7d484d621fb3ee61389bdd20e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 17 Jul 2024 09:48:13 +0000 Subject: [PATCH 251/777] [gn build] Port e94e72a0c229 --- llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index 1eed616330eb1..1708af8612bc2 100644 --- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -97,6 +97,7 @@ static_library("AST") { "Interp/Context.cpp", "Interp/Descriptor.cpp", "Interp/Disasm.cpp", + "Interp/DynamicAllocator.cpp", "Interp/EvalEmitter.cpp", "Interp/EvaluationResult.cpp", "Interp/Floating.cpp", From 0b71d8020f1181c75c305d34943ed42bb1948177 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 17 Jul 2024 11:25:08 +0100 Subject: [PATCH 252/777] [InstrRef][NFC] Avoid un-necessary DenseMap queries (#99048) This patch adjusts how some data is stored to avoid a number of un-necessary DenseMap queries. There's no change to the compiler behaviour, and it's measurably faster on the compile time tracker. The BlockOrders vector in buildVLocValueMap collects the blocks over which a variables value have to be determined: however the Cmp ordering function makes two DenseMap queries to determine the RPO-order of blocks being compared. And given that sorting is O(N log(N)) comparisons this isn't fast. So instead, fetch the RPO-numbers of the block collection, order those, and then map back to the blocks themselves. The OrderToBB collection mapped an RPO-number to an MBB: it's completely un-necessary to have DenseMap here, we can just use the RPO number as an array index. Switch it to a SmallVector and deal with a few consequences when iterating. (And for good measure I've jammed in a couple of reserve calls). --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 44 +++++++++++-------- .../LiveDebugValues/InstrRefBasedImpl.h | 2 +- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 555cbb7a507f4..bde8cc4a89715 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -3109,12 +3109,8 @@ void InstrRefBasedLDV::buildVLocValueMap( SmallPtrSet BlocksToExplore; // The order in which to examine them (RPO). - SmallVector BlockOrders; - - // RPO ordering function. 
- auto Cmp = [&](MachineBasicBlock *A, MachineBasicBlock *B) { - return BBToOrder[A] < BBToOrder[B]; - }; + SmallVector BlockOrders; + SmallVector BlockOrderNums; getBlocksForScope(DILoc, BlocksToExplore, AssignBlocks); @@ -3132,11 +3128,16 @@ void InstrRefBasedLDV::buildVLocValueMap( for (const auto *MBB : BlocksToExplore) MutBlocksToExplore.insert(const_cast(MBB)); - // Picks out relevants blocks RPO order and sort them. + // Picks out relevants blocks RPO order and sort them. Sort their + // order-numbers and map back to MBB pointers later, to avoid repeated + // DenseMap queries during comparisons. for (const auto *MBB : BlocksToExplore) - BlockOrders.push_back(const_cast(MBB)); + BlockOrderNums.push_back(BBToOrder[MBB]); - llvm::sort(BlockOrders, Cmp); + llvm::sort(BlockOrderNums); + for (unsigned int I : BlockOrderNums) + BlockOrders.push_back(OrderToBB[I]); + BlockOrderNums.clear(); unsigned NumBlocks = BlockOrders.size(); // Allocate some vectors for storing the live ins and live outs. Large. @@ -3396,16 +3397,24 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { return DL.getLine() != 0; return false; }; - // Collect a set of all the artificial blocks. - for (auto &MBB : MF) + + // Collect a set of all the artificial blocks. Collect the size too, ilist + // size calls are O(n). + unsigned int Size = 0; + for (auto &MBB : MF) { + ++Size; if (none_of(MBB.instrs(), hasNonArtificialLocation)) ArtificialBlocks.insert(&MBB); + } // Compute mappings of block <=> RPO order. ReversePostOrderTraversal RPOT(&MF); unsigned int RPONumber = 0; + OrderToBB.reserve(Size); + BBToOrder.reserve(Size); + BBNumToRPO.reserve(Size); auto processMBB = [&](MachineBasicBlock *MBB) { - OrderToBB[RPONumber] = MBB; + OrderToBB.push_back(MBB); BBToOrder[MBB] = RPONumber; BBNumToRPO[MBB->getNumber()] = RPONumber; ++RPONumber; @@ -3724,14 +3733,13 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // Walk back through each block / instruction, collecting DBG_VALUE // instructions and recording what machine value their operands refer to. - for (auto &OrderPair : OrderToBB) { - MachineBasicBlock &MBB = *OrderPair.second; - CurBB = MBB.getNumber(); + for (MachineBasicBlock *MBB : OrderToBB) { + CurBB = MBB->getNumber(); VTracker = &vlocs[CurBB]; - VTracker->MBB = &MBB; - MTracker->loadFromArray(MInLocs[MBB], CurBB); + VTracker->MBB = MBB; + MTracker->loadFromArray(MInLocs[*MBB], CurBB); CurInst = 1; - for (auto &MI : MBB) { + for (auto &MI : *MBB) { process(MI, &MOutLocs, &MInLocs); ++CurInst; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 6d77a6972f09b..8770983481c2f 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -1153,7 +1153,7 @@ class InstrRefBasedLDV : public LDVImpl { SmallPtrSet ArtificialBlocks; // Mapping of blocks to and from their RPOT order. - DenseMap OrderToBB; + SmallVector OrderToBB; DenseMap BBToOrder; DenseMap BBNumToRPO; From 8d28a4102b3668c75d061235c2890546757f4257 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 17 Jul 2024 19:38:23 +0900 Subject: [PATCH 253/777] [AMDGPU] Remove SIWholeQuadMode pass early exit (#98450) Merge the code bypass elements from the early exit into the main pass execution flow. 
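As a simplified, hypothetical sketch of the pattern (invented names, not the actual SIWholeQuadMode code): instead of a dedicated early-return path for shaders that need no wave-mode changes, every lowering step reports whether it modified anything and the cheap case flows through the same sequence.

// Hedged sketch only: the stand-in steps model the restructured pass flow.
#include <cstdio>

static bool lowerQueries() { return false; }   // placeholder lowering steps
static bool lowerCopies() { return false; }
static bool lowerKills(bool IsWQM) { return IsWQM; }
static bool runFullLowering() { return true; }

static bool runPassSketch(bool HasWaveModes) {
  bool Changed = false;
  Changed |= lowerQueries();
  Changed |= lowerCopies();
  if (!HasWaveModes)
    Changed |= lowerKills(false);   // the former early-exit case
  else
    Changed |= runFullLowering();   // full wave-mode switching pipeline
  return Changed;
}

int main() {
  std::printf("simple changed: %d, full changed: %d\n",
              runPassSketch(false), runPassSketch(true));
  return 0;
}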
--- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 63 +++++++++++++--------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 3dc8cc17afc16..df7906ebd8a7e 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -219,11 +219,12 @@ class SIWholeQuadMode : public MachineFunctionPass { void lowerBlock(MachineBasicBlock &MBB); void processBlock(MachineBasicBlock &MBB, bool IsEntry); - void lowerLiveMaskQueries(); - void lowerCopyInstrs(); - void lowerKillInstrs(bool IsWQM); + bool lowerLiveMaskQueries(); + bool lowerCopyInstrs(); + bool lowerKillInstrs(bool IsWQM); void lowerInitExec(MachineInstr &MI); - MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry); + MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry, + bool &Changed); public: static char ID; @@ -796,6 +797,8 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI) { + assert(LiveMaskReg.isVirtual()); + const DebugLoc &DL = MI.getDebugLoc(); unsigned Opcode = 0; @@ -913,6 +916,8 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, bool IsWQM) { + assert(LiveMaskReg.isVirtual()); + const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *MaskUpdateMI = nullptr; @@ -1144,6 +1149,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SaveWQM) { + assert(LiveMaskReg.isVirtual()); + bool IsTerminator = Before == MBB.end(); if (!IsTerminator) { auto FirstTerm = MBB.getFirstTerminator(); @@ -1423,7 +1430,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { assert(!SavedNonStrictReg); } -void SIWholeQuadMode::lowerLiveMaskQueries() { +bool SIWholeQuadMode::lowerLiveMaskQueries() { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -1435,9 +1442,10 @@ void SIWholeQuadMode::lowerLiveMaskQueries() { LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); } + return !LiveMaskQueries.empty(); } -void SIWholeQuadMode::lowerCopyInstrs() { +bool SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToMovInstrs) { assert(MI->getNumExplicitOperands() == 2); @@ -1492,9 +1500,10 @@ void SIWholeQuadMode::lowerCopyInstrs() { *MRI, MI->getOperand(0))); MI->setDesc(TII->get(CopyOp)); } + return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } -void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { +bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { for (MachineInstr *MI : KillInstrs) { MachineBasicBlock *MBB = MI->getParent(); MachineInstr *SplitPoint = nullptr; @@ -1510,6 +1519,7 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { if (SplitPoint) splitBlock(MBB, SplitPoint); } + return !KillInstrs.empty(); } void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { @@ -1601,7 +1611,7 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry /// for instructions that depend on EXEC. 
MachineBasicBlock::iterator -SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) { +SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) { MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI(); for (MachineInstr *MI : InitExecInstrs) { @@ -1612,6 +1622,7 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) { InsertPt = std::next(MI->getIterator()); lowerInitExec(*MI); + Changed = true; } return InsertPt; @@ -1664,48 +1675,50 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { } const char GlobalFlags = analyzeFunction(MF); - const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); + bool Changed = false; LiveMaskReg = Exec; MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry); - - // Shader is simple does not need any state changes or any complex lowering - if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && - LowerToMovInstrs.empty() && KillInstrs.empty()) { - lowerLiveMaskQueries(); - if (!InitExecInstrs.empty()) - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); - } + MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed); // Store a copy of the original live mask when required - if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + const bool HasLiveMaskQueries = !LiveMaskQueries.empty(); + const bool HasWaveModes = GlobalFlags & ~StateExact; + const bool HasKills = !KillInstrs.empty(); + const bool UsesWQM = GlobalFlags & StateWQM; + if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) { LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) .addReg(Exec); LIS->InsertMachineInstrInMaps(*MI); + Changed = true; } LLVM_DEBUG(printInfo()); - lowerLiveMaskQueries(); - lowerCopyInstrs(); + Changed |= lowerLiveMaskQueries(); + Changed |= lowerCopyInstrs(); - // Shader only needs WQM - if (GlobalFlags == StateWQM) { + if (!HasWaveModes) { + // No wave mode execution + Changed |= lowerKillInstrs(false); + } else if (GlobalFlags == StateWQM) { + // Shader only needs WQM auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) .addReg(Exec); LIS->InsertMachineInstrInMaps(*MI); lowerKillInstrs(true); + Changed = true; } else { + // Wave mode switching requires full lowering pass. for (auto BII : Blocks) processBlock(*BII.first, BII.first == &Entry); // Lowering blocks causes block splitting so perform as a second pass. for (auto BII : Blocks) lowerBlock(*BII.first); + Changed = true; } // Compute live range for live mask @@ -1721,5 +1734,5 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (!KillInstrs.empty() || !InitExecInstrs.empty()) LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - return true; + return Changed; } From d216615518875f828b9055ac79dbdb32e539367a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 17 Jul 2024 11:43:41 +0100 Subject: [PATCH 254/777] [LV] Process dead interleave pointer ops in reverse order. Process dead interleave pointer ops in reverse order. This also catches cases where the same base pointer is used by multiple different interleave groups. This fixes another case where the legacy cost model inaccuarately estimates cost, surfaced by b841e2eca3b5c8. 
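The reverse-order seeding is the key detail; below is a simplified, hypothetical sketch of the idea (not the LoopVectorize code itself, names invented): candidates are gathered in program order and then visited in reverse, so a pointer computation is examined only after any later computation that uses it, letting a shared base feeding several interleave groups be proven dead bottom-up.

// Hedged sketch: p1 is derived from p0 and both only feed interleave
// members.  Visiting in reverse program order marks p1 dead first, so by
// the time p0 is examined its only user is already dead and p0 dies too.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<std::string>> Users = {
      {"p0", {"p1"}}, {"p1", {}}};
  std::vector<std::string> ProgramOrder = {"p0", "p1"};
  std::set<std::string> Dead;
  for (auto It = ProgramOrder.rbegin(); It != ProgramOrder.rend(); ++It) {
    bool AllUsersDead = true;
    for (const auto &U : Users[*It])
      AllUsersDead = AllUsersDead && Dead.count(U) != 0;
    if (AllUsersDead)
      Dead.insert(*It);
  }
  std::printf("dead pointer ops: %zu\n", Dead.size()); // 2 with reverse order
  return 0;
}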
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../LoopVectorize/X86/interleave-cost.ll | 195 ++++++++++++++++++ 2 files changed, 200 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 71ec7252bd950..bc1a566d230ee 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6998,7 +6998,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); - SmallSetVector DeadInterleavePointerOps; + SmallVector InitialInterleavePointersOps; for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { // Find all stores to invariant variables. Since they are going to sink @@ -7016,10 +7016,13 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { if (Group->getInsertPos() == &I) continue; Value *PointerOp = getLoadStorePointerOperand(&I); - DeadInterleavePointerOps.insert(PointerOp); + InitialInterleavePointersOps.push_back(PointerOp); } } + SmallSetVector DeadInterleavePointerOps( + InitialInterleavePointersOps.rbegin(), + InitialInterleavePointersOps.rend()); // Mark ops feeding interleave group members as free, if they are only used // by other dead computations. for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 0091ebd1ca773..9bba1a90096e6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -182,9 +182,204 @@ loop: exit: ret void } + +define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr %arg2) #0 { +; CHECK-LABEL: define void @geps_feeding_interleave_groups_with_reuse( +; CHECK-SAME: ptr [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 30 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG2]], i64 8 +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]] +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 12 +; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]]) +; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[ARG2]], i64 4 +; CHECK-NEXT: 
[[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]]) +; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[SCEVGEP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW8]] +; CHECK-NEXT: [[MUL9:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]]) +; CHECK-NEXT: [[MUL_RESULT10:%.*]] = extractvalue { i64, i1 } [[MUL9]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW11:%.*]] = extractvalue { i64, i1 } [[MUL9]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 0, [[MUL_RESULT10]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[MUL_RESULT10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult ptr [[TMP14]], [[ARG2]] +; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW11]] +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP18]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[ARG1]], 4 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 16 +; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = shl i64 [[ARG1]], 5 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 32 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[TMP23]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP13]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[ARG]], [[SCEVGEP12]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 5 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP24]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP26]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP29]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: 
[[STRIDED_VEC20:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[STRIDED_VEC]], [[STRIDED_VEC17]] +; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x float> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = fadd <2 x float> [[STRIDED_VEC14]], [[STRIDED_VEC18]] +; CHECK-NEXT: [[TMP33:%.*]] = fmul <2 x float> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x float> [[STRIDED_VEC15]], [[STRIDED_VEC19]] +; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x float> [[TMP34]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = fadd <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC20]] +; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x float> [[TMP36]], zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP28]], i64 12 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP38]], i32 -3 +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> [[TMP33]], <4 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP35]], <2 x float> [[TMP37]], <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x float> [[TMP40]], <4 x float> [[TMP41]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP42]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP39]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SHL_IV_5:%.*]] = shl i64 [[IV]], 5 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[SHL_IV_5]] +; CHECK-NEXT: [[ADD_5:%.*]] = or disjoint i64 [[SHL_IV_5]], 16 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[ADD_5]] +; CHECK-NEXT: [[SHL_IV_4:%.*]] = shl i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[SHL_IV_4]] +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_1]], align 4 +; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], [[L_2]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 0.000000e+00 +; CHECK-NEXT: store float [[MUL_1]], ptr [[GEP_3]], align 4 +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[GEP_1]], i64 4 +; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_4]], align 4 +; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr i8, ptr [[GEP_2]], i64 4 +; CHECK-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_5]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_3]], [[L_4]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 0.000000e+00 +; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr i8, ptr [[GEP_3]], i64 4 +; CHECK-NEXT: store float [[MUL_2]], ptr [[GEP_6]], align 4 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr i8, ptr [[GEP_1]], i64 8 +; CHECK-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_7]], align 4 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[GEP_2]], 
i64 8 +; CHECK-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_8]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_5]], [[L_6]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 0.000000e+00 +; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr i8, ptr [[GEP_3]], i64 8 +; CHECK-NEXT: store float [[MUL_3]], ptr [[GEP_9]], align 4 +; CHECK-NEXT: [[I27:%.*]] = getelementptr i8, ptr [[GEP_1]], i64 12 +; CHECK-NEXT: [[L_7:%.*]] = load float, ptr [[I27]], align 4 +; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr i8, ptr [[GEP_2]], i64 12 +; CHECK-NEXT: [[L_8:%.*]] = load float, ptr [[GEP_10]], align 4 +; CHECK-NEXT: [[ADD_4:%.*]] = fadd float [[L_7]], [[L_8]] +; CHECK-NEXT: [[MUL_4:%.*]] = fmul float [[ADD_4]], 0.000000e+00 +; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr i8, ptr [[GEP_3]], i64 12 +; CHECK-NEXT: store float [[MUL_4]], ptr [[GEP_11]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[ARG1]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %shl.iv.5 = shl i64 %iv, 5 + %gep.1 = getelementptr i8, ptr %arg, i64 %shl.iv.5 + %add.5 = or disjoint i64 %shl.iv.5, 16 + %gep.2 = getelementptr i8, ptr %arg, i64 %add.5 + %shl.iv.4 = shl i64 %iv, 4 + %gep.3 = getelementptr i8, ptr %arg2, i64 %shl.iv.4 + %l.1 = load float, ptr %gep.1, align 4 + %l.2 = load float, ptr %gep.2, align 4 + %add.1 = fadd float %l.1, %l.2 + %mul.1 = fmul float %add.1, 0.000000e+00 + store float %mul.1, ptr %gep.3, align 4 + %gep.4 = getelementptr i8, ptr %gep.1, i64 4 + %l.3 = load float, ptr %gep.4, align 4 + %gep.5 = getelementptr i8, ptr %gep.2, i64 4 + %l.4 = load float, ptr %gep.5, align 4 + %add.2 = fadd float %l.3, %l.4 + %mul.2 = fmul float %add.2, 0.000000e+00 + %gep.6 = getelementptr i8, ptr %gep.3, i64 4 + store float %mul.2, ptr %gep.6, align 4 + %gep.7 = getelementptr i8, ptr %gep.1, i64 8 + %l.5 = load float, ptr %gep.7, align 4 + %gep.8 = getelementptr i8, ptr %gep.2, i64 8 + %l.6 = load float, ptr %gep.8, align 4 + %add.3 = fadd float %l.5, %l.6 + %mul.3 = fmul float %add.3, 0.000000e+00 + %gep.9 = getelementptr i8, ptr %gep.3, i64 8 + store float %mul.3, ptr %gep.9, align 4 + %i27 = getelementptr i8, ptr %gep.1, i64 12 + %l.7 = load float, ptr %i27, align 4 + %gep.10 = getelementptr i8, ptr %gep.2, i64 12 + %l.8 = load float, ptr %gep.10, align 4 + %add.4 = fadd float %l.7, %l.8 + %mul.4 = fmul float %add.4, 0.000000e+00 + %gep.11 = getelementptr i8, ptr %gep.3, i64 12 + store float %mul.4, ptr %gep.11, align 4 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %arg1 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +attributes #0 = { "target-features"="+sse4.2" } + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} ;. 
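The LoopVectorize.cpp hunk at the top of this patch gathers the interleave-group pointer operands into a plain SmallVector first and only then seeds the SmallSetVector worklist from reverse iterators, so the indexed walk that follows visits the pointers in the opposite order while still de-duplicating them. A minimal stand-alone sketch of that ADT idiom, with illustrative element types and names that are not taken from the patch, looks like this:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Collect candidates in discovery order, then build a de-duplicating
// worklist whose iteration order is the reverse of discovery order.
static void walkInReverse(ArrayRef<int> Candidates) {
  SmallVector<int> Initial;
  for (int C : Candidates)
    Initial.push_back(C); // discovery order

  // The range constructor inserts Initial.rbegin()..Initial.rend(), so
  // duplicates are dropped and the first element processed below is the
  // last one that was collected.
  SmallSetVector<int, 8> Worklist(Initial.rbegin(), Initial.rend());

  // An index-based loop tolerates appends while iterating, mirroring the
  // "for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I)"
  // loop in the hunk above.
  for (unsigned I = 0; I != Worklist.size(); ++I) {
    int Cur = Worklist[I];
    if (Cur > 0)
      Worklist.insert(Cur - 1); // may add new work at the end
  }
}
```

Why the patch wants the reverse order is not stated in the hunk itself; the sketch only shows the mechanics of seeding a SetVector from reverse iterators.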
From 762c607d8ecb2bf678375f79ac23e143be0b5f3f Mon Sep 17 00:00:00 2001 From: Tianyi Guan Date: Mon, 24 Jun 2024 11:43:56 +0100 Subject: [PATCH 255/777] [AArch64][GISel] Add test cases for folding shifts into load/store addressing modes (NFC) --- .../GlobalISel/load-addressing-modes.mir | 325 ++++++++++++-- .../GlobalISel/store-addressing-modes.mir | 211 +++++++-- .../CodeGen/AArch64/aarch64-fold-lslfast.ll | 406 +++++++++++++++--- 3 files changed, 799 insertions(+), 143 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir index 7921de6ce2362..3af2aaf57eed8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -1,22 +1,30 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FAST --allow-unused-prefixes +# RUN: llc -mtriple=aarch64-unknown-unknown -mattr=+addr-lsl-slow-14 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SLOW --allow-unused-prefixes --- | define void @ldrxrox_breg_oreg(ptr %addr) { ret void } define void @ldrdrox_breg_oreg(ptr %addr) { ret void } define void @more_than_one_use(ptr %addr) { ret void } + define void @ldrhrox_shl(ptr %addr) { ret void } + define void @ldrwrox_shl(ptr %addr) { ret void } define void @ldrxrox_shl(ptr %addr) { ret void } define void @ldrdrox_shl(ptr %addr) { ret void } + define void @ldrqrox_shl(ptr %addr) { ret void } define void @ldrxrox_mul_rhs(ptr %addr) { ret void } define void @ldrdrox_mul_rhs(ptr %addr) { ret void } define void @ldrxrox_mul_lhs(ptr %addr) { ret void } define void @ldrdrox_mul_lhs(ptr %addr) { ret void } define void @mul_not_pow_2(ptr %addr) { ret void } define void @mul_wrong_pow_2(ptr %addr) { ret void } - define void @more_than_one_use_shl_1(ptr %addr) { ret void } - define void @more_than_one_use_shl_2(ptr %addr) { ret void } - define void @more_than_one_use_shl_lsl_fast(ptr %addr) { ret void } - define void @more_than_one_use_shl_lsl_slow(ptr %addr) { ret void } + define void @more_than_one_use_shl_fallback(ptr %addr) { ret void } + define void @ldrxrox_more_than_one_mem_use_shl(ptr %addr) { ret void } + define void @ldrxrox_more_than_one_use_shl(ptr %addr) { ret void } + define void @ldrhrox_more_than_one_mem_use_shl(ptr %addr) { ret void } + define void @ldrhrox_more_than_one_use_shl(ptr %addr) { ret void } + define void @ldrwrox_more_than_one_use_shl(ptr %addr) { ret void } + define void @ldrqrox_more_than_one_use_shl(ptr %addr) { ret void } + define void @more_than_one_use_shl_lsl(ptr %addr) { ret void } define void @more_than_one_use_shl_minsize(ptr %addr) #0 { ret void } define void @ldrwrox(ptr %addr) { ret void } define void @ldrsrox(ptr %addr) { ret void } @@ -113,6 +121,67 @@ body: | ... 
--- +name: ldrhrox_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + liveins: $w1, $x0 + + ; CHECK-LABEL: name: ldrhrox_shl + ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 + ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-NEXT: RET_ReallyLR implicit [[LDRHHroX]] + %0:gpr(p0) = COPY $x0 + %1:gpr(s32) = COPY $w1 + %15:gpr(s64) = G_CONSTANT i64 9 + %3:gpr(s32) = G_LSHR %1, %15(s64) + %4:gpr(s64) = G_ZEXT %3(s32) + %5:gpr(s64) = G_CONSTANT i64 255 + %6:gpr(s64) = G_AND %4, %5 + %13:gpr(s64) = G_CONSTANT i64 1 + %8:gpr(s64) = G_SHL %6, %13(s64) + %9:gpr(p0) = G_PTR_ADD %0, %8(s64) + %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) + RET_ReallyLR implicit %12 +... +--- +name: ldrwrox_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: ldrwrox_shl + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY1]], [[COPY]], 0, 1 :: (load (s32) from %ir.addr) + ; CHECK-NEXT: RET_ReallyLR implicit [[LDRWroX]] + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 2 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %4:gpr(p0) = G_PTR_ADD %3, %2 + %5:gpr(s32) = G_LOAD %4(p0) :: (load (s32) from %ir.addr) + RET_ReallyLR implicit %5 +... +--- name: ldrxrox_shl alignment: 4 legalized: true @@ -167,6 +236,32 @@ body: | $d2 = COPY %5(s64) RET_ReallyLR implicit $d2 +... +--- +name: ldrqrox_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $d2 + ; CHECK-LABEL: name: ldrqrox_shl + ; CHECK: liveins: $x0, $x1, $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRQroX:%[0-9]+]]:fpr128 = LDRQroX [[COPY1]], [[COPY]], 0, 1 :: (load (s128) from %ir.addr) + ; CHECK-NEXT: RET_ReallyLR implicit [[LDRQroX]] + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 4 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %4:gpr(p0) = G_PTR_ADD %3, %2 + %5:fpr(s128) = G_LOAD %4(p0) :: (load (s128) from %ir.addr) + RET_ReallyLR implicit %5 + ... --- name: ldrxrox_mul_rhs @@ -352,7 +447,7 @@ body: | # Show that we can still fall back to the register-register addressing # mode when we fail to pull in the shift. 
-name: more_than_one_use_shl_1 +name: more_than_one_use_shl_fallback alignment: 4 legalized: true regBankSelected: true @@ -361,19 +456,19 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: more_than_one_use_shl_1 + ; CHECK-LABEL: name: more_than_one_use_shl_fallback ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 62, 61 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load (s64) from %ir.addr) - ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 2, 0 ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]] ; CHECK-NEXT: $x2 = COPY [[ADDXrr]] ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 - %1:gpr(s64) = G_CONSTANT i64 3 + %1:gpr(s64) = G_CONSTANT i64 2 %2:gpr(s64) = G_SHL %0, %1(s64) %3:gpr(p0) = COPY $x1 %4:gpr(p0) = G_PTR_ADD %3, %2 @@ -385,10 +480,48 @@ body: | ... --- -# Show that when the GEP is used outside a memory op, we don't do any -# folding at all. +name: ldrxrox_more_than_one_mem_use_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: ldrxrox_more_than_one_mem_use_shl + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s64)) + ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s64)) + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]] + ; CHECK-NEXT: RET_ReallyLR implicit [[ADDXrr]] + %0:gpr(p0) = COPY $x0 + %1:gpr(s32) = COPY $w1 + %15:gpr(s64) = G_CONSTANT i64 9 + %3:gpr(s32) = G_LSHR %1, %15(s64) + %4:gpr(s64) = G_ZEXT %3(s32) + %5:gpr(s64) = G_CONSTANT i64 255 + %6:gpr(s64) = G_AND %4, %5 + %13:gpr(s64) = G_CONSTANT i64 3 + %8:gpr(s64) = G_SHL %6, %13(s64) + %9:gpr(p0) = G_PTR_ADD %0, %8(s64) + %12:gpr(s64) = G_LOAD %9(p0) :: (load (s64)) + %17:gpr(s64) = G_LOAD %9(p0) :: (load (s64)) + %18:gpr(s64) = G_ADD %12, %17 + RET_ReallyLR implicit %18 -name: more_than_one_use_shl_2 +... +--- +# Show that when the GEP is used both inside and outside a memory op, we only fold the memory op. + +name: ldrxrox_more_than_one_use_shl alignment: 4 legalized: true regBankSelected: true @@ -397,7 +530,7 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: more_than_one_use_shl_2 + ; CHECK-LABEL: name: ldrxrox_more_than_one_use_shl ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 @@ -426,10 +559,90 @@ body: | ... --- -# Show that when we have a fastpath for shift-left, we perform the folding -# if it has more than one use. +# Fold SHL into LSL for mem ops. Do not fold if the target has LSLSLOW14. 
+name: ldrhrox_more_than_one_mem_use_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + liveins: $w1, $x0 + + ; CHECK-LABEL: name: ldrhrox_more_than_one_mem_use_shl + ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 + ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]] + ; CHECK-NEXT: RET_ReallyLR implicit [[ADDWrr]] + %0:gpr(p0) = COPY $x0 + %1:gpr(s32) = COPY $w1 + %15:gpr(s64) = G_CONSTANT i64 9 + %3:gpr(s32) = G_LSHR %1, %15(s64) + %4:gpr(s64) = G_ZEXT %3(s32) + %5:gpr(s64) = G_CONSTANT i64 255 + %6:gpr(s64) = G_AND %4, %5 + %13:gpr(s64) = G_CONSTANT i64 1 + %8:gpr(s64) = G_SHL %6, %13(s64) + %9:gpr(p0) = G_PTR_ADD %0, %8(s64) + %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) + %17:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) + %18:gpr(s32) = G_ADD %12, %17 + RET_ReallyLR implicit %18 +... +--- +# Fold SHL into LSL for memory ops. Do not fold if the target has LSLSLOW14. +name: ldrhrox_more_than_one_use_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + liveins: $w1, $x0 -name: more_than_one_use_shl_lsl_fast + ; CHECK-LABEL: name: ldrhrox_more_than_one_use_shl + ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 + ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]] + ; CHECK-NEXT: RET_ReallyLR implicit [[ADDWrr]] + %0:gpr(p0) = COPY $x0 + %1:gpr(s32) = COPY $w1 + %15:gpr(s64) = G_CONSTANT i64 9 + %3:gpr(s32) = G_LSHR %1, %15(s64) + %4:gpr(s64) = G_ZEXT %3(s32) + %5:gpr(s64) = G_CONSTANT i64 255 + %6:gpr(s64) = G_AND %4, %5 + %13:gpr(s64) = G_CONSTANT i64 1 + %8:gpr(s64) = G_SHL %6, %13(s64) + %9:gpr(p0) = G_PTR_ADD %0, %8(s64) + %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) + %17:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) + %18:gpr(s32) = G_ADD %12, %17 + RET_ReallyLR implicit %18 +... +--- +# Fold SHL into LSL for memory ops. 
+name: ldrwrox_more_than_one_use_shl alignment: 4 legalized: true regBankSelected: true @@ -438,33 +651,81 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: more_than_one_use_shl_lsl_fast + ; CHECK-LABEL: name: ldrwrox_more_than_one_use_shl ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]] - ; CHECK-NEXT: $x2 = COPY [[ADDXrr]] + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 62, 61 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] + ; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[ADDXrr]], 0 :: (load (s32) from %ir.addr) + ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[LDRWui]], 0 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 2, 0 + ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[SUBREG_TO_REG]], [[ADDXri]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] + ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]] + ; CHECK-NEXT: $x2 = COPY [[ADDXrr2]] ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 - %1:gpr(s64) = G_CONSTANT i64 3 + %1:gpr(s64) = G_CONSTANT i64 2 %2:gpr(s64) = G_SHL %0, %1(s64) %3:gpr(p0) = COPY $x1 %4:gpr(p0) = G_PTR_ADD %3, %2 - %5:gpr(s64) = G_LOAD %4(p0) :: (load (s64) from %ir.addr) - %6:gpr(s64) = G_LOAD %4(p0) :: (load (s64) from %ir.addr) + %20:gpr(s32) = G_LOAD %4(p0) :: (load (s32) from %ir.addr) + %5:gpr(s64) = G_ZEXT %20 + %6:gpr(s64) = G_ADD %2, %1 %7:gpr(s64) = G_ADD %5, %6 - $x2 = COPY %7(s64) + %8:gpr(s64) = G_PTRTOINT %4 + %9:gpr(s64) = G_ADD %8, %7 + $x2 = COPY %9(s64) RET_ReallyLR implicit $x2 - ... --- -# Show that we don't fold into multiple memory ops when we don't have a -# fastpath for shift-left. +# Fold SHL into LSL for memory ops. Do not fold if the target has LSLSLOW14. 
+name: ldrqrox_more_than_one_use_shl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: ldrqrox_more_than_one_use_shl + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] + ; CHECK-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADDXrr]], 0 :: (load (s128) from %ir.addr) + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[LDRQui]].dsub + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] + ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY3]], [[ADDXri]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] + ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXrr1]] + ; CHECK-NEXT: RET_ReallyLR implicit [[ADDXrr2]] + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 4 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %4:gpr(p0) = G_PTR_ADD %3, %2 + %20:fpr(s128) = G_LOAD %4(p0) :: (load (s128) from %ir.addr) + %6:gpr(s64) = G_ADD %2, %1 + %200:fpr(s64) = G_TRUNC %20 + %2000:gpr(s64) = COPY %200 + %7:gpr(s64) = G_ADD %2000, %6 + %8:gpr(s64) = G_PTRTOINT %4 + %9:gpr(s64) = G_ADD %8, %7 + RET_ReallyLR implicit %9 +... +--- +# Show that when we have a fastpath for shift-left, we perform the folding +# if it has more than one use. -name: more_than_one_use_shl_lsl_slow +name: more_than_one_use_shl_lsl alignment: 4 legalized: true regBankSelected: true @@ -473,7 +734,7 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: more_than_one_use_shl_lsl_slow + ; CHECK-LABEL: name: more_than_one_use_shl_lsl ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir index 8214b632e5f33..62ebe86504bfa 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FAST +# RUN: llc -mtriple=aarch64-unknown-unknown -mattr=+addr-lsl-slow-14 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SLOW --- | define void @strxrox(ptr %addr) { ret void } @@ -9,7 +10,11 @@ define void @strsrox(ptr %addr) { ret void } define void @strhrox(ptr %addr) { ret void } define void @strqrox(ptr %addr) { ret void } - define void @shl(ptr %addr) { ret void } + define void @shl_fast_3(ptr %addr) { ret void } + define void @shl_slow_1(ptr %addr) { ret void } + define void @shl_slow_1_more_than_one_use(ptr %addr) { ret void } + define void @shl_slow_4(ptr %addr) { ret void } + define void @shl_slow_4_more_than_one_use(ptr %addr) { ret void } define void @shl_p0(ptr %addr) { ret void } ... 
@@ -25,10 +30,11 @@ body: | liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: strxrox ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 - ; CHECK: STRXroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s64) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-NEXT: STRXroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s64) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -47,11 +53,12 @@ body: | liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: strxrox_p0 ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:gpr64all = COPY $x2 - ; CHECK: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] - ; CHECK: STRXroX [[COPY3]], [[COPY]], [[COPY1]], 0, 0 :: (store (p0) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64all = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] + ; CHECK-NEXT: STRXroX [[COPY3]], [[COPY]], [[COPY1]], 0, 0 :: (store (p0) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -70,10 +77,11 @@ body: | liveins: $x0, $x1, $d2 ; CHECK-LABEL: name: strdrox ; CHECK: liveins: $x0, $x1, $d2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d2 - ; CHECK: STRDroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s64) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY $d2 + ; CHECK-NEXT: STRDroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s64) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -92,10 +100,11 @@ body: | liveins: $x0, $x1, $w2 ; CHECK-LABEL: name: strwrox ; CHECK: liveins: $x0, $x1, $w2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY $w2 - ; CHECK: STRWroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s32) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $w2 + ; CHECK-NEXT: STRWroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s32) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -114,10 +123,11 @@ body: | liveins: $x0, $x1, $s2 ; CHECK-LABEL: name: strsrox ; CHECK: liveins: $x0, $x1, $s2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY $s2 - ; CHECK: STRSroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s32) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY $s2 + ; CHECK-NEXT: STRSroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s32) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -136,10 
+146,11 @@ body: | liveins: $x0, $x1, $h0 ; CHECK-LABEL: name: strhrox ; CHECK: liveins: $x0, $x1, $h0 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK: STRHroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s16) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 + ; CHECK-NEXT: STRHroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (s16) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -158,10 +169,11 @@ body: | liveins: $x0, $x1, $q2 ; CHECK-LABEL: name: strqrox ; CHECK: liveins: $x0, $x1, $q2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q2 - ; CHECK: STRQroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (<2 x s64>) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q2 + ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY]], [[COPY1]], 0, 0 :: (store (<2 x s64>) into %ir.addr) %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %ptr:gpr(p0) = G_PTR_ADD %0, %1 @@ -169,7 +181,7 @@ body: | G_STORE %2, %ptr :: (store (<2 x s64>) into %ir.addr) ... --- -name: shl +name: shl_fast_3 alignment: 4 legalized: true regBankSelected: true @@ -178,12 +190,13 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: shl + ; CHECK-LABEL: name: shl_fast_3 ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 - ; CHECK: STRXroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s64) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-NEXT: STRXroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s64) into %ir.addr) %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -193,6 +206,114 @@ body: | G_STORE %4, %ptr :: (store (s64) into %ir.addr) ... --- +name: shl_slow_1 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: shl_slow_1 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-NEXT: STRHHroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 1 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %ptr:gpr(p0) = G_PTR_ADD %3, %2 + %4:gpr(s64) = COPY $x2 + G_STORE %4, %ptr :: (store (s16) into %ir.addr) +... 
+--- +name: shl_slow_1_more_than_one_use +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: shl_slow_1_more_than_one_use + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-NEXT: STRHHroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-NEXT: STRHHroX [[COPY4]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 1 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %ptr:gpr(p0) = G_PTR_ADD %3, %2 + %4:gpr(s64) = COPY $x2 + %5:gpr(s16) = G_TRUNC %4 + G_STORE %4, %ptr :: (store (s16) into %ir.addr) + G_STORE %4, %ptr :: (store (s16) into %ir.addr) +... +--- +name: shl_slow_4 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2, $q0 + ; CHECK-LABEL: name: shl_slow_4 + ; CHECK: liveins: $x0, $x1, $x2, $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 4 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %ptr:gpr(p0) = G_PTR_ADD %3, %2 + %5:fpr(s128) = COPY $q0 + G_STORE %5, %ptr :: (store (s128) into %ir.addr) +... +--- +name: shl_slow_4_more_than_one_use +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1, $x2, $q0 + ; CHECK-LABEL: name: shl_slow_4_more_than_one_use + ; CHECK: liveins: $x0, $x1, $x2, $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) + ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = G_CONSTANT i64 4 + %2:gpr(s64) = G_SHL %0, %1(s64) + %3:gpr(p0) = COPY $x1 + %ptr:gpr(p0) = G_PTR_ADD %3, %2 + %5:fpr(s128) = COPY $q0 + G_STORE %5, %ptr :: (store (s128) into %ir.addr) + G_STORE %5, %ptr :: (store (s128) into %ir.addr) +... 
+--- name: shl_p0 alignment: 4 legalized: true @@ -204,11 +325,12 @@ body: | liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: shl_p0 ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:gpr64all = COPY $x2 - ; CHECK: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] - ; CHECK: STRXroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (p0) into %ir.addr) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64all = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] + ; CHECK-NEXT: STRXroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (p0) into %ir.addr) %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -216,3 +338,8 @@ body: | %ptr:gpr(p0) = G_PTR_ADD %3, %2 %4:gpr(p0) = COPY $x2 G_STORE %4, %ptr :: (store (p0) into %ir.addr) +... + +# NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# CHECK-FAST: {{.*}} +# CHECK-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll index 022aaea9ef0cc..614ac15d959f0 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0 -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0,CHECK0-SDAG +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK,CHECK0,CHECK0-GISEL +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3,CHECK3-SDAG +; RUN: llc < %s -mtriple=aarch64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK,CHECK3,CHECK3-GISEL %struct.a = type [256 x i16] %struct.b = type [256 x i32] @@ -8,36 +10,66 @@ declare void @foo() define i16 @halfword(ptr %ctx, i32 %xor72) nounwind { -; CHECK0-LABEL: halfword: -; CHECK0: // %bb.0: -; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK0-NEXT: ubfx x8, x1, #9, #8 -; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: lsl x21, x8, #1 -; CHECK0-NEXT: ldrh w20, [x0, x21] -; CHECK0-NEXT: bl foo -; CHECK0-NEXT: mov w0, w20 -; CHECK0-NEXT: strh w20, [x19, x21] -; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK0-NEXT: ret +; CHECK0-SDAG-LABEL: halfword: +; CHECK0-SDAG: // %bb.0: +; CHECK0-SDAG-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK0-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-SDAG-NEXT: ubfx x8, x1, #9, #8 +; CHECK0-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: mov x19, x0 +; CHECK0-SDAG-NEXT: lsl x21, x8, #1 +; CHECK0-SDAG-NEXT: ldrh w20, [x0, x21] +; CHECK0-SDAG-NEXT: bl foo +; CHECK0-SDAG-NEXT: mov w0, w20 +; CHECK0-SDAG-NEXT: strh w20, [x19, x21] +; CHECK0-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ret ; -; CHECK3-LABEL: halfword: -; CHECK3: // %bb.0: -; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK3-NEXT: ubfx x21, x1, #9, #8 -; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK3-NEXT: mov x19, x0 -; CHECK3-NEXT: ldrh w20, [x0, x21, lsl #1] -; CHECK3-NEXT: bl foo -; CHECK3-NEXT: mov w0, w20 -; CHECK3-NEXT: strh w20, [x19, x21, lsl #1] -; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK3-NEXT: ret +; CHECK0-GISEL-LABEL: halfword: +; CHECK0-GISEL: // %bb.0: +; CHECK0-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: lsr w8, w1, #9 +; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: mov x19, x0 +; CHECK0-GISEL-NEXT: and x21, x8, #0xff +; CHECK0-GISEL-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK0-GISEL-NEXT: bl foo +; CHECK0-GISEL-NEXT: mov w0, w20 +; CHECK0-GISEL-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ret +; +; CHECK3-SDAG-LABEL: halfword: +; CHECK3-SDAG: // %bb.0: +; CHECK3-SDAG-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-SDAG-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: mov x19, x0 +; CHECK3-SDAG-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK3-SDAG-NEXT: bl foo +; CHECK3-SDAG-NEXT: mov w0, w20 +; CHECK3-SDAG-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK3-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ret +; +; CHECK3-GISEL-LABEL: halfword: +; CHECK3-GISEL: // %bb.0: +; CHECK3-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: lsr w8, w1, #9 +; CHECK3-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: mov x19, x0 +; CHECK3-GISEL-NEXT: and x21, x8, #0xff +; CHECK3-GISEL-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK3-GISEL-NEXT: bl foo +; CHECK3-GISEL-NEXT: mov w0, w20 +; CHECK3-GISEL-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK3-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -49,20 +81,65 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind { } define i32 @word(ptr %ctx, i32 %xor72) nounwind { -; CHECK-LABEL: word: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: ubfx x21, x1, #9, #8 -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: ldr w20, [x0, x21, lsl #2] -; CHECK-NEXT: bl foo -; CHECK-NEXT: mov w0, w20 -; CHECK-NEXT: str w20, [x19, x21, lsl #2] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK0-SDAG-LABEL: word: +; CHECK0-SDAG: // %bb.0: +; CHECK0-SDAG-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-SDAG-NEXT: ubfx x21, x1, #9, #8 +; CHECK0-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: mov x19, x0 +; CHECK0-SDAG-NEXT: ldr w20, [x0, x21, lsl #2] +; CHECK0-SDAG-NEXT: bl foo +; CHECK0-SDAG-NEXT: mov w0, w20 +; CHECK0-SDAG-NEXT: str w20, [x19, x21, lsl #2] +; CHECK0-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ret +; +; CHECK0-GISEL-LABEL: word: +; CHECK0-GISEL: // %bb.0: +; CHECK0-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: lsr w8, w1, #9 +; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: mov x19, x0 +; CHECK0-GISEL-NEXT: and x21, x8, #0xff +; CHECK0-GISEL-NEXT: ldr w20, [x0, x21, lsl #2] +; CHECK0-GISEL-NEXT: bl foo +; CHECK0-GISEL-NEXT: mov w0, w20 +; CHECK0-GISEL-NEXT: str w20, [x19, x21, lsl #2] +; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ret +; +; CHECK3-SDAG-LABEL: word: +; CHECK3-SDAG: // %bb.0: +; CHECK3-SDAG-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-SDAG-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: mov x19, x0 +; CHECK3-SDAG-NEXT: ldr w20, [x0, x21, lsl #2] +; CHECK3-SDAG-NEXT: bl foo +; CHECK3-SDAG-NEXT: mov w0, w20 +; CHECK3-SDAG-NEXT: str w20, [x19, x21, lsl #2] +; CHECK3-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ret +; +; CHECK3-GISEL-LABEL: word: +; CHECK3-GISEL: // %bb.0: +; CHECK3-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: lsr w8, w1, #9 +; CHECK3-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: mov x19, x0 +; CHECK3-GISEL-NEXT: and x21, x8, #0xff +; CHECK3-GISEL-NEXT: ldr w20, [x0, x21, lsl #2] +; CHECK3-GISEL-NEXT: bl foo +; CHECK3-GISEL-NEXT: mov w0, w20 +; CHECK3-GISEL-NEXT: str w20, [x19, x21, lsl #2] +; CHECK3-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -74,20 +151,65 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind { } define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind { -; CHECK-LABEL: doubleword: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: ubfx x21, x1, #9, #8 -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: ldr x20, [x0, x21, lsl #3] -; CHECK-NEXT: bl foo -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: str x20, [x19, x21, lsl #3] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK0-SDAG-LABEL: doubleword: +; CHECK0-SDAG: // %bb.0: +; CHECK0-SDAG-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-SDAG-NEXT: ubfx x21, x1, #9, #8 +; CHECK0-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: mov x19, x0 +; CHECK0-SDAG-NEXT: ldr x20, [x0, x21, lsl #3] +; CHECK0-SDAG-NEXT: bl foo +; CHECK0-SDAG-NEXT: mov x0, x20 +; CHECK0-SDAG-NEXT: str x20, [x19, x21, lsl #3] +; CHECK0-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ret +; +; CHECK0-GISEL-LABEL: doubleword: +; CHECK0-GISEL: // %bb.0: +; CHECK0-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: lsr w8, w1, #9 +; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: mov x19, x0 +; CHECK0-GISEL-NEXT: and x21, x8, #0xff +; CHECK0-GISEL-NEXT: ldr x20, [x0, x21, lsl #3] +; CHECK0-GISEL-NEXT: bl foo +; CHECK0-GISEL-NEXT: mov x0, x20 +; CHECK0-GISEL-NEXT: str x20, [x19, x21, lsl #3] +; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ret +; +; CHECK3-SDAG-LABEL: doubleword: +; CHECK3-SDAG: // %bb.0: +; CHECK3-SDAG-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-SDAG-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: mov x19, x0 +; CHECK3-SDAG-NEXT: ldr x20, [x0, x21, lsl #3] +; CHECK3-SDAG-NEXT: bl foo +; CHECK3-SDAG-NEXT: mov x0, x20 +; CHECK3-SDAG-NEXT: str x20, [x19, x21, lsl #3] +; CHECK3-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ret +; +; CHECK3-GISEL-LABEL: doubleword: +; CHECK3-GISEL: // %bb.0: +; CHECK3-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: lsr w8, w1, #9 +; CHECK3-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: mov x19, x0 +; CHECK3-GISEL-NEXT: and x21, x8, #0xff +; CHECK3-GISEL-NEXT: ldr x20, [x0, x21, lsl #3] +; CHECK3-GISEL-NEXT: bl foo +; CHECK3-GISEL-NEXT: mov x0, x20 +; CHECK3-GISEL-NEXT: str x20, [x19, x21, lsl #3] +; CHECK3-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -98,17 +220,129 @@ define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind { ret i64 %result } -define i64 @multi_use_non_memory(i64 %a, i64 %b) { -; CHECK-LABEL: multi_use_non_memory: +define i16 @multi_use_half_word(ptr %ctx, i32 %xor72) { +; CHECK0-SDAG-LABEL: multi_use_half_word: +; CHECK0-SDAG: // %bb.0: // %entry +; CHECK0-SDAG-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK0-SDAG-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK0-SDAG-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK0-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK0-SDAG-NEXT: .cfi_offset w21, -24 +; CHECK0-SDAG-NEXT: .cfi_offset w22, -32 +; CHECK0-SDAG-NEXT: .cfi_offset w30, -48 +; CHECK0-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-SDAG-NEXT: ubfx x8, x1, #9, #8 +; CHECK0-SDAG-NEXT: mov x19, x0 +; CHECK0-SDAG-NEXT: lsl x21, x8, #1 +; CHECK0-SDAG-NEXT: ldrh w20, [x0, x21] +; CHECK0-SDAG-NEXT: add w22, w20, #1 +; CHECK0-SDAG-NEXT: bl foo +; CHECK0-SDAG-NEXT: mov w0, w20 +; CHECK0-SDAG-NEXT: strh w22, [x19, x21] +; CHECK0-SDAG-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK0-SDAG-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK0-SDAG-NEXT: ret +; +; CHECK0-GISEL-LABEL: multi_use_half_word: +; CHECK0-GISEL: // %bb.0: // %entry +; CHECK0-GISEL-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; CHECK0-GISEL-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-GISEL-NEXT: .cfi_offset w19, -8 +; CHECK0-GISEL-NEXT: .cfi_offset w20, -16 +; CHECK0-GISEL-NEXT: .cfi_offset w21, -24 +; CHECK0-GISEL-NEXT: .cfi_offset w22, -32 +; CHECK0-GISEL-NEXT: .cfi_offset w30, -48 +; CHECK0-GISEL-NEXT: lsr w8, w1, #9 +; CHECK0-GISEL-NEXT: mov x19, x0 +; CHECK0-GISEL-NEXT: and x21, x8, #0xff +; CHECK0-GISEL-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK0-GISEL-NEXT: add w22, w20, #1 +; CHECK0-GISEL-NEXT: bl foo +; CHECK0-GISEL-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK0-GISEL-NEXT: mov w0, w20 +; CHECK0-GISEL-NEXT: strh w22, [x19, x21, lsl #1] +; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK0-GISEL-NEXT: ret +; +; CHECK3-SDAG-LABEL: multi_use_half_word: +; CHECK3-SDAG: // %bb.0: // %entry +; CHECK3-SDAG-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; CHECK3-SDAG-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK3-SDAG-NEXT: .cfi_def_cfa_offset 48 +; CHECK3-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK3-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK3-SDAG-NEXT: .cfi_offset w21, -24 +; CHECK3-SDAG-NEXT: .cfi_offset w22, -32 +; CHECK3-SDAG-NEXT: .cfi_offset w30, -48 +; CHECK3-SDAG-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-SDAG-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-SDAG-NEXT: mov x19, x0 +; CHECK3-SDAG-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK3-SDAG-NEXT: add w22, w20, #1 +; CHECK3-SDAG-NEXT: bl foo +; CHECK3-SDAG-NEXT: mov w0, w20 +; CHECK3-SDAG-NEXT: strh w22, [x19, x21, lsl #1] +; CHECK3-SDAG-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK3-SDAG-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK3-SDAG-NEXT: ret +; +; CHECK3-GISEL-LABEL: multi_use_half_word: +; CHECK3-GISEL: // %bb.0: // %entry +; CHECK3-GISEL-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK3-GISEL-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK3-GISEL-NEXT: .cfi_def_cfa_offset 48 +; CHECK3-GISEL-NEXT: .cfi_offset w19, -8 +; CHECK3-GISEL-NEXT: .cfi_offset w20, -16 +; CHECK3-GISEL-NEXT: .cfi_offset w21, -24 +; CHECK3-GISEL-NEXT: .cfi_offset w22, -32 +; CHECK3-GISEL-NEXT: .cfi_offset w30, -48 +; CHECK3-GISEL-NEXT: lsr w8, w1, #9 +; CHECK3-GISEL-NEXT: mov x19, x0 +; CHECK3-GISEL-NEXT: and x21, x8, #0xff +; CHECK3-GISEL-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK3-GISEL-NEXT: add w22, w20, #1 +; CHECK3-GISEL-NEXT: bl foo +; CHECK3-GISEL-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK3-GISEL-NEXT: mov w0, w20 +; CHECK3-GISEL-NEXT: strh w22, [x19, x21, lsl #1] +; CHECK3-GISEL-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK3-GISEL-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK3-GISEL-NEXT: ret +entry: + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %idxprom83 = and i64 %conv82, 255 + %arrayidx86 = getelementptr inbounds %struct.a, ptr %ctx, i64 0, i64 %idxprom83 + %result = load i16, ptr %arrayidx86, align 2 + %result2 = add i16 %result, 1 + call void @foo() + store i16 %result, ptr %arrayidx86, align 2 + store i16 %result2, ptr %arrayidx86, align 2 + ret i16 %result +} + +define i64 @multi_use_non_memory_call(i64 %a, i64 %b) { +; CHECK-LABEL: multi_use_non_memory_call: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsl x8, x0, #3 ; CHECK-NEXT: lsl x9, x1, #3 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.lt .LBB3_2 +; CHECK-NEXT: b.lt .LBB4_2 ; CHECK-NEXT: // %bb.1: // %falsebb ; CHECK-NEXT: csel x0, x8, x9, gt ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB3_2: // %truebb +; CHECK-NEXT: .LBB4_2: // %truebb ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -144,12 +378,43 @@ define i64 @gep3(ptr %p, i64 %b) { } define i128 @gep4(ptr %p, i128 %a, i64 %b) { -; CHECK-LABEL: gep4: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, x4, lsl #4 -; CHECK-NEXT: ldp x0, x1, [x8] -; CHECK-NEXT: stp x2, x3, [x8] -; CHECK-NEXT: ret +; CHECK0-SDAG-LABEL: gep4: +; CHECK0-SDAG: // %bb.0: +; CHECK0-SDAG-NEXT: add x8, x0, x4, lsl #4 +; CHECK0-SDAG-NEXT: ldp x0, x1, [x8] +; CHECK0-SDAG-NEXT: stp x2, x3, [x8] +; CHECK0-SDAG-NEXT: ret +; +; CHECK0-GISEL-LABEL: gep4: +; CHECK0-GISEL: // %bb.0: +; CHECK0-GISEL-NEXT: ldr q1, [x0, x4, lsl #4] +; CHECK0-GISEL-NEXT: mov v0.d[0], x2 +; CHECK0-GISEL-NEXT: mov x8, x0 +; CHECK0-GISEL-NEXT: mov d2, v1.d[1] +; CHECK0-GISEL-NEXT: fmov x0, d1 +; CHECK0-GISEL-NEXT: mov v0.d[1], x3 +; CHECK0-GISEL-NEXT: fmov x1, d2 +; CHECK0-GISEL-NEXT: str q0, [x8, x4, lsl #4] +; CHECK0-GISEL-NEXT: ret +; +; CHECK3-SDAG-LABEL: gep4: +; CHECK3-SDAG: // %bb.0: +; CHECK3-SDAG-NEXT: add x8, x0, x4, lsl #4 +; CHECK3-SDAG-NEXT: ldp x0, x1, [x8] +; CHECK3-SDAG-NEXT: stp x2, x3, [x8] +; CHECK3-SDAG-NEXT: ret +; +; CHECK3-GISEL-LABEL: gep4: +; CHECK3-GISEL: // %bb.0: +; CHECK3-GISEL-NEXT: ldr q1, [x0, x4, lsl #4] +; CHECK3-GISEL-NEXT: mov v0.d[0], x2 +; CHECK3-GISEL-NEXT: mov x8, x0 +; CHECK3-GISEL-NEXT: mov d2, v1.d[1] +; CHECK3-GISEL-NEXT: fmov x0, d1 +; CHECK3-GISEL-NEXT: mov v0.d[1], x3 +; CHECK3-GISEL-NEXT: fmov x1, d2 +; CHECK3-GISEL-NEXT: str q0, [x8, x4, lsl #4] +; CHECK3-GISEL-NEXT: ret %g = getelementptr inbounds i128, ptr %p, i64 %b %l = load i128, ptr %g store i128 %a, ptr %g @@ -185,3 +450,6 @@ define i64 @addlsl4(i64 %a, i64 %b) { %r = xor i64 %y, %z ret i64 %r } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK0: {{.*}} +; CHECK3: {{.*}} From 290184880ae22541738e280397da33fe515e4c86 Mon Sep 17 00:00:00 2001 From: Krasimir Georgiev Date: Wed, 17 Jul 2024 10:53:25 +0000 Subject: [PATCH 256/777] Revert "[AArch64] Remove superfluous sxtw in peephole opt (#96293)" This reverts commit 7f2a5dfe35f8bbaca2819644c7aa844f938befd6. It appears that after this, llc segfaults on the following code: ``` target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" target triple = "aarch64--linux-eabi" define i32 @f(i32 %0) { entry: %1 = sext i32 %0 to i64 br label %A A: %2 = trunc i64 %1 to i32 %a69.us = sub i32 0, %2 %a69.us.fr = freeze i32 %a69.us %3 = zext i32 %a69.us.fr to i64 br label %B B: %t = icmp eq i64 0, %3 br i1 %t, label %A, label %B } ``` assert.h assertion failed at .../llvm/lib/CodeGen/LiveVariables.cpp:159 in void llvm::LiveVariables::HandleVirtRegUse(Register, MachineBasicBlock *, MachineInstr &): MRI->getVRegDef(Reg) && "Register use before def!" 
--- .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 32 ------------- llvm/lib/Target/AArch64/peephole-sxtw.mir | 46 ------------------- .../CodeGen/AArch64/aarch64-mull-masks.ll | 12 +++-- 3 files changed, 8 insertions(+), 82 deletions(-) delete mode 100644 llvm/lib/Target/AArch64/peephole-sxtw.mir diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index a758e0d3e7b55..bd11bc4dd6e3f 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -128,7 +128,6 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitINSviGPR(MachineInstr &MI, unsigned Opc); bool visitINSvi64lane(MachineInstr &MI); bool visitFMOVDr(MachineInstr &MI); - bool visitCopy(MachineInstr &MI); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -691,34 +690,6 @@ bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) { return true; } -// Across a basic-block we might have in i32 extract from a value that only -// operates on upper bits (for example a sxtw). We can replace the COPY with a -// new version skipping the sxtw. -bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) { - Register InputReg = MI.getOperand(1).getReg(); - if (MI.getOperand(1).getSubReg() != AArch64::sub_32 || - !MRI->hasOneNonDBGUse(InputReg)) - return false; - - MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg); - MachineInstr *CopyMI = SrcMI; - while (SrcMI && SrcMI->isFullCopy() && - MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) - SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg()); - - if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri || - SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31) - return false; - - Register SrcReg = SrcMI->getOperand(1).getReg(); - MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg)); - MI.getOperand(1).setReg(SrcReg); - if (CopyMI != SrcMI) - CopyMI->eraseFromParent(); - SrcMI->eraseFromParent(); - return true; -} - bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -800,9 +771,6 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { case AArch64::FMOVDr: Changed |= visitFMOVDr(MI); break; - case AArch64::COPY: - Changed |= visitCopy(MI); - break; } } } diff --git a/llvm/lib/Target/AArch64/peephole-sxtw.mir b/llvm/lib/Target/AArch64/peephole-sxtw.mir deleted file mode 100644 index 6dd91fbf6ec1d..0000000000000 --- a/llvm/lib/Target/AArch64/peephole-sxtw.mir +++ /dev/null @@ -1,46 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -run-pass=aarch64-mi-peephole-opt -o - -mtriple=aarch64-unknown-linux -verify-machineinstrs %s | FileCheck %s - ---- -name: removeSxtw -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $x0 - ; CHECK-LABEL: name: removeSxtw - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32sp = COPY [[COPY]].sub_32 - ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY1]], 1, 0 - ; CHECK-NEXT: $w0 = COPY [[ADDWri]] - ; CHECK-NEXT: RET_ReallyLR implicit $w0 - %0:gpr64 = COPY $x0 - %1:gpr64 = SBFMXri %0:gpr64, 0, 31 - %2:gpr32sp = COPY %1.sub_32:gpr64 - %3:gpr32sp = ADDWri %2:gpr32sp, 1, 0 - $w0 = COPY %3:gpr32sp - RET_ReallyLR implicit $w0 -... 
---- -name: extraCopy -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $x0 - ; CHECK-LABEL: name: extraCopy - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32sp = COPY [[COPY]].sub_32 - ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY1]], 1, 0 - ; CHECK-NEXT: $w0 = COPY [[ADDWri]] - ; CHECK-NEXT: RET_ReallyLR implicit $w0 - %0:gpr64 = COPY $x0 - %1:gpr64 = SBFMXri %0:gpr64, 0, 31 - %2:gpr64all = COPY %1:gpr64 - %3:gpr32sp = COPY %2.sub_32:gpr64all - %4:gpr32sp = ADDWri %3:gpr32sp, 1, 0 - $w0 = COPY %4:gpr32sp - RET_ReallyLR implicit $w0 -... diff --git a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll index f49d469e50cdd..9b0701ab148dc 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll @@ -281,7 +281,8 @@ define i64 @smull_ldrsw_shift(ptr %x0, i64 %x1) { ; CHECK-LABEL: smull_ldrsw_shift: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldrsw x8, [x0] -; CHECK-NEXT: smull x0, w8, w1 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: smull x0, w8, w9 ; CHECK-NEXT: ret entry: %ext64 = load i32, ptr %x0 @@ -489,7 +490,8 @@ define i64 @smaddl_ldrsw_shift(ptr %x0, i64 %x1, i64 %x2) { ; CHECK-LABEL: smaddl_ldrsw_shift: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldrsw x8, [x0] -; CHECK-NEXT: smaddl x0, w8, w1, x2 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: smaddl x0, w8, w9, x2 ; CHECK-NEXT: ret entry: %ext64 = load i32, ptr %x0 @@ -652,7 +654,8 @@ define i64 @smnegl_ldrsw_shift(ptr %x0, i64 %x1) { ; CHECK-LABEL: smnegl_ldrsw_shift: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldrsw x8, [x0] -; CHECK-NEXT: smnegl x0, w8, w1 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: smnegl x0, w8, w9 ; CHECK-NEXT: ret entry: %ext64 = load i32, ptr %x0 @@ -815,7 +818,8 @@ define i64 @smsubl_ldrsw_shift(ptr %x0, i64 %x1, i64 %x2) { ; CHECK-LABEL: smsubl_ldrsw_shift: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldrsw x8, [x0] -; CHECK-NEXT: smsubl x0, w8, w1, x2 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: smsubl x0, w8, w9, x2 ; CHECK-NEXT: ret entry: %ext64 = load i32, ptr %x0 From 8156be684da4c37b6191dab26d2eb5c2777be17d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 07:16:13 -0400 Subject: [PATCH 257/777] [LV][NFC]Introduce isScalableVectorizationAllowed() to refactor getMaxLegalScalableVF(). Adds isScalableVectorizationAllowed() and the corresponding data member to query if the scalable vectorization is supported rather than performing the analysis each time the scalable vector factor is requested. Part of https://github.com/llvm/llvm-project/pull/91403 Reviewers: ayalz, fhahn Reviewed By: fhahn, ayalz Pull Request: https://github.com/llvm/llvm-project/pull/98916 --- .../Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index bc1a566d230ee..5fc365f77efbb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1628,6 +1628,10 @@ class LoopVectorizationCostModel { ElementCount MaxSafeVF, bool FoldTailByMasking); + /// Checks if scalable vectorization is supported and enabled. Caches the + /// result to avoid repeated debug dumps for repeated queries. 
+ bool isScalableVectorizationAllowed(); + /// \return the maximum legal scalable VF, based on the safe max number /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); @@ -1692,6 +1696,9 @@ class LoopVectorizationCostModel { std::optional> ChosenTailFoldingStyle; + /// true if scalable vectorization is supported and enabled. + std::optional IsScalableVectorizationAllowed; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -4144,15 +4151,18 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -ElementCount -LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { +bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { + if (IsScalableVectorizationAllowed) + return *IsScalableVectorizationAllowed; + + IsScalableVectorizationAllowed = false; if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) - return ElementCount::getScalable(0); + return false; if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", "ScalableVectorizationDisabled", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); @@ -4172,7 +4182,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { "Scalable vectorization not supported for the reduction " "operations found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } // Disable scalable vectorization if the loop contains any instructions @@ -4184,17 +4194,36 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { reportVectorizationInfo("Scalable vectorization is not supported " "for all element types found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; + } + + if (!Legal->isSafeForAnyVectorWidth()) { + std::optional MaxVScale = getMaxVScale(*TheFunction, TTI); + if (!MaxVScale) { + reportVectorizationInfo( + "The target does not provide maximum vscale value.", + "ScalableVFUnfeasible", ORE, TheLoop); + return false; + } } + IsScalableVectorizationAllowed = true; + return true; +} + +ElementCount +LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { + if (!isScalableVectorizationAllowed()) + return ElementCount::getScalable(0); + + auto MaxScalableVF = ElementCount::getScalable( + std::numeric_limits::max()); if (Legal->isSafeForAnyVectorWidth()) return MaxScalableVF; + std::optional MaxVScale = getMaxVScale(*TheFunction, TTI); // Limit MaxScalableVF by the maximum safe dependence distance. - if (std::optional MaxVScale = getMaxVScale(*TheFunction, TTI)) - MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); - else - MaxScalableVF = ElementCount::getScalable(0); + MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); if (!MaxScalableVF) reportVectorizationInfo( From 440fffad7e7231fab766c6e00e47a39ad5a9b95e Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 17 Jul 2024 19:17:01 +0800 Subject: [PATCH 258/777] [Clang][Concepts] Avoid substituting into constraints for invalid TemplateDecls (#75697) Fixes https://github.com/llvm/llvm-project/issues/73885. 
Substituting into constraints for invalid TemplateDecls might still yield dependent expressions and end up crashing later in evaluation. --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaConcept.cpp | 6 ++++++ clang/test/SemaTemplate/instantiate-requires-expr.cpp | 10 ++++++++++ 3 files changed, 17 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8c0d1635d2756..6dc45956a9afb 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1042,6 +1042,7 @@ Bug Fixes to C++ Support - Fixed failed assertion when resolving context of defaulted comparison method outside of struct. (#GH96043). - Clang now diagnoses explicit object parameters in member pointers and other contexts where they should not appear. Fixes (#GH85992). +- Fixed a crash-on-invalid bug involving extraneous template parameter with concept substitution. (#GH73885) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 84c5753a46ac3..9e16b67284be4 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -625,6 +625,12 @@ bool Sema::CheckConstraintSatisfaction( *this, nullptr, ConstraintExprs, ConvertedConstraints, TemplateArgsLists, TemplateIDRange, OutSatisfaction); } + // Invalid templates could make their way here. Substituting them could result + // in dependent expressions. + if (Template->isInvalidDecl()) { + OutSatisfaction.IsSatisfied = false; + return true; + } // A list of the template argument list flattened in a predictible manner for // the purposes of caching. The ConstraintSatisfaction type is in AST so it diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp index 516708bf4c875..20a19d731ae16 100644 --- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp +++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp @@ -227,3 +227,13 @@ struct r6 {}; using r6i = r6; // expected-error@-1 {{constraints not satisfied for class template 'r6' [with T = int]}} + +namespace GH73885 { + +template // expected-error {{extraneous}} +template requires(T{}) +constexpr bool e_v = true; + +static_assert(e_v); + +} // namespace GH73885 From 6425f2d66740b84fc3027b649cd4baf660c384e8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 07:17:25 -0400 Subject: [PATCH 259/777] [SLP]Improve minbitwidth analysis for trun'ed gather nodes. If the gather node is trunc'ed, better to trunc scalars and then gather them rather than gather and then trunc. Trunc for scalars is free in most cases. 
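A minimal IR-level sketch of the intended effect (types and value names are illustrative; compare the int-bitcast-minbitwidth.ll update below):

```llvm
; Before: gather the wide scalars, then truncate the whole vector.
%g0 = insertelement <4 x i64> poison, i64 %v, i32 0
%g  = shufflevector <4 x i64> %g0, <4 x i64> poison, <4 x i32> zeroinitializer
%t  = trunc <4 x i64> %g to <4 x i16>

; After: truncate the scalar once (usually free), then gather the narrow values.
%s  = trunc i64 %v to i16
%n0 = insertelement <4 x i16> poison, i16 %s, i32 0
%n  = shufflevector <4 x i16> %n0, <4 x i16> poison, <4 x i32> zeroinitializer
```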
Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/99072 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 15 ++++++- .../X86/int-bitcast-minbitwidth.ll | 6 +-- .../X86/minbitwidth-transformed-operand.ll | 22 ++++----- .../Transforms/SLPVectorizer/X86/resched.ll | 45 ++++++++++--------- .../SLPVectorizer/X86/shuffle-multivector.ll | 13 +++--- .../orig-btiwidth-les-projected.ll | 8 ++-- 6 files changed, 62 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7b981bead6bb8..722590a840a54 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15502,8 +15502,21 @@ void BoUpSLP::computeMinimumValueSizes() { auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, unsigned Opcode, unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) { + bool IsSignedCmp) -> unsigned { ToDemote.clear(); + // Check if the root is trunc and the next node is gather/buildvector, then + // keep trunc in scalars, which is free in most cases. + if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && + E.Idx > (IsStoreOrInsertElt ? 2 : 1)) { + ToDemote.push_back(E.Idx); + const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; + auto It = MinBWs.find(UserTE); + if (It != MinBWs.end()) + return It->second.first; + return DL->getTypeSizeInBits( + E.UserTreeIndices.back().UserTE->Scalars.front()->getType()); + } + unsigned VF = E.getVectorFactor(); auto *TreeRootIT = dyn_cast(E.Scalars.front()->getType()); if (!TreeRootIT || !Opcode) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index 789d73947d1c7..97e505f4319c6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -5,9 +5,9 @@ define void @t(i64 %v) { ; CHECK-LABEL: define void @t( ; CHECK-SAME: i64 [[V:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[V]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 032625a1199f9..57b5d2af48ee6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -5,20 +5,16 @@ define void @test(i64 %d.promoted.i) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x 
i1> -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> , <16 x i1> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index b7237cbb02bb3..4ed52247c2ef3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,26 +11,31 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], -; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 -; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 -; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], -; CHECK-NEXT: store <16 x i8> 
[[TMP15]], ptr undef, align 1 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB_I]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i8> [[TMP19]], +; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll index 143052a3d9cd0..c2555889f5981 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-160 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( @@ -14,13 +14,14 @@ define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[T5:%.*]] = trunc i128 [[P1]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i128> [[TMP1]], <2 x i128> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i128> [[VEC:%.*]], <4 x i128> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i128> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i128> [[VEC:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i128> [[VEC]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x 
i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP12]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP12]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP13]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll index 531e964053482..88503aeb6071f 100644 --- a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll +++ b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll @@ -5,10 +5,10 @@ define i32 @test(i4 %0) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: i4 [[TMP0:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = trunc i8 0 to i4 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i8 0 to i4 -; CHECK-NEXT: [[ADD_R:%.*]] = or i4 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ADD_R14:%.*]] = or i4 0, [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i4> , i4 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i4> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[ADD_R:%.*]] = extractelement <2 x i4> [[TMP2]], i32 0 +; CHECK-NEXT: [[ADD_R14:%.*]] = extractelement <2 x i4> [[TMP2]], i32 1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i4 [[ADD_R]], [[ADD_R14]] ; CHECK-NEXT: ret i32 0 ; From d2bfc2b52bfc1c17248b897ae8618865d4d9a3af Mon Sep 17 00:00:00 2001 From: Lin Jian Date: Wed, 17 Jul 2024 19:20:08 +0800 Subject: [PATCH 260/777] [emacs] Fix autoloading for llvm-mir-mode (#98984) Without this patch, the autoloading of the major mode `llvm-mir-mode` is not generated, which breaks its autoloading functionality. To test this patch, use the following command to generate an autoload file: ```console cd llvm/utils/emacs emacs --quick --batch --load=package --eval='(package-generate-autoloads "llvm-mir-mode" ".")' ``` Diff of generated autoload files is as follows: ```diff > (autoload 'llvm-mir-mode "llvm-mir-mode" "\ > A major mode for editing LLVM MIR files. > > (fn)" t) ``` CC @bogner for review --- llvm/utils/emacs/llvm-mir-mode.el | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/emacs/llvm-mir-mode.el b/llvm/utils/emacs/llvm-mir-mode.el index 5ded9cce50bb7..e53ffe825478b 100644 --- a/llvm/utils/emacs/llvm-mir-mode.el +++ b/llvm/utils/emacs/llvm-mir-mode.el @@ -56,7 +56,7 @@ llvm-font-lock-keywords) "Keyword highlighting specification for `llvm-mir-mode'.") - ;;;###autoload +;;;###autoload (define-derived-mode llvm-mir-mode prog-mode "LLVM MIR" "A major mode for editing LLVM MIR files." 
(setq-local comment-start "; ") From 343ed3fd5a5e183f0edf87a89955af772aaadcfb Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 17 Jul 2024 11:18:45 +0000 Subject: [PATCH 261/777] [lldb][Bazel]: Adapt BUILD.bazel file for a751f653b40f2021f091a2f1ebcc2d91bc4cc89d --- utils/bazel/llvm-project-overlay/lldb/BUILD.bazel | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index cc573864e29b1..c7ea5f9d938b9 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -721,7 +721,10 @@ cc_library( cc_library( name = "Utility", srcs = glob(["source/Utility/**/*.cpp"]), - hdrs = glob(["include/lldb/Utility/**/*.h"]), + hdrs = glob( + ["include/lldb/Utility/**/*.h"], + ["source/Utility/*.h"], + ), strip_include_prefix = "include", deps = [ ":Headers", From 8d97cbcf27becd88e053ef888605f55b68272e60 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 17 Jul 2024 11:30:35 +0000 Subject: [PATCH 262/777] Revert "[lldb][Bazel]: Adapt BUILD.bazel file for a751f653b40f2021f091a2f1ebcc2d91bc4cc89d" This reverts commit 343ed3fd5a5e183f0edf87a89955af772aaadcfb. --- utils/bazel/llvm-project-overlay/lldb/BUILD.bazel | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index c7ea5f9d938b9..cc573864e29b1 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -721,10 +721,7 @@ cc_library( cc_library( name = "Utility", srcs = glob(["source/Utility/**/*.cpp"]), - hdrs = glob( - ["include/lldb/Utility/**/*.h"], - ["source/Utility/*.h"], - ), + hdrs = glob(["include/lldb/Utility/**/*.h"]), strip_include_prefix = "include", deps = [ ":Headers", From b5b9832b42f436986c7290bc1e912e2f276b4e6b Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 17 Jul 2024 11:32:34 +0000 Subject: [PATCH 263/777] [lldb][Bazel]: Second attempt to adapt for a751f653b40f2021f091a2f1ebcc2d91bc4cc89d --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 161b58814d276..329b5bf7c532d 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -1086,6 +1086,7 @@ cc_library( "//lldb:Target", "//lldb:TargetHeaders", "//lldb:Utility", + "//lldb:UtilityPrivateHeaders", "//llvm:Core", "//llvm:TargetParser", ], From 64f67a448740480b0ff5d8c89ee2b99878c5559c Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Wed, 17 Jul 2024 13:48:12 +0200 Subject: [PATCH 264/777] adjust the Xtensa backend after change f270a4dd6667759d7305797a077ae09648318ac7 Similar fix as in 3941f652317d95cac203e64791bfa730de7bbd1e --- llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 5d5a34157cc9f..80d01d662a221 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -356,7 +356,7 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Memcpy = DAG.getMemcpy( Chain, DL, Address, ArgValue, SizeNode, 
Flags.getNonZeroByValAlign(), /*isVolatile=*/false, /*AlwaysInline=*/false, - /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); MemOpChains.push_back(Memcpy); } else { assert(VA.isMemLoc() && "Argument not register or memory"); From bc8a8f5415c522f99600171e012d511c010d7309 Mon Sep 17 00:00:00 2001 From: MagentaTreehouse <99200384+MagentaTreehouse@users.noreply.github.com> Date: Wed, 17 Jul 2024 08:09:35 -0400 Subject: [PATCH 265/777] [clang][Sema] Improve `Sema::CheckCXXDefaultArguments` (#97338) In the second loop in `Sema::CheckCXXDefaultArguments`, we don't need to re-examine the first parameter with a default argument. Dropped the first iteration of that loop. In addition, use the preferred early `continue` for the if-statement in the loop. --- clang/lib/Sema/SemaDeclCXX.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index f24912cde275a..2bfb103e8953d 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -1630,9 +1630,6 @@ void Sema::MergeVarDeclExceptionSpecs(VarDecl *New, VarDecl *Old) { /// function declaration are well-formed according to C++ /// [dcl.fct.default]. void Sema::CheckCXXDefaultArguments(FunctionDecl *FD) { - unsigned NumParams = FD->getNumParams(); - unsigned ParamIdx = 0; - // This checking doesn't make sense for explicit specializations; their // default arguments are determined by the declaration we're specializing, // not by FD. @@ -1642,6 +1639,9 @@ void Sema::CheckCXXDefaultArguments(FunctionDecl *FD) { if (FTD->isMemberSpecialization()) return; + unsigned NumParams = FD->getNumParams(); + unsigned ParamIdx = 0; + // Find first parameter with a default argument for (; ParamIdx < NumParams; ++ParamIdx) { ParmVarDecl *Param = FD->getParamDecl(ParamIdx); @@ -1654,21 +1654,19 @@ void Sema::CheckCXXDefaultArguments(FunctionDecl *FD) { // with a default argument shall have a default argument supplied in this or // a previous declaration, unless the parameter was expanded from a // parameter pack, or shall be a function parameter pack. - for (; ParamIdx < NumParams; ++ParamIdx) { + for (++ParamIdx; ParamIdx < NumParams; ++ParamIdx) { ParmVarDecl *Param = FD->getParamDecl(ParamIdx); - if (!Param->hasDefaultArg() && !Param->isParameterPack() && - !(CurrentInstantiationScope && - CurrentInstantiationScope->isLocalPackExpansion(Param))) { - if (Param->isInvalidDecl()) - /* We already complained about this parameter. */; - else if (Param->getIdentifier()) - Diag(Param->getLocation(), - diag::err_param_default_argument_missing_name) + if (Param->hasDefaultArg() || Param->isParameterPack() || + (CurrentInstantiationScope && + CurrentInstantiationScope->isLocalPackExpansion(Param))) + continue; + if (Param->isInvalidDecl()) + /* We already complained about this parameter. */; + else if (Param->getIdentifier()) + Diag(Param->getLocation(), diag::err_param_default_argument_missing_name) << Param->getIdentifier(); - else - Diag(Param->getLocation(), - diag::err_param_default_argument_missing); - } + else + Diag(Param->getLocation(), diag::err_param_default_argument_missing); } } From fa0e52995929ab67dfb468d71fe793be5e1c7f03 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Wed, 17 Jul 2024 08:11:02 -0400 Subject: [PATCH 266/777] [Flang] Exclude the reference to TIME_UTC for AIX. 
(#99069) This PR supersede PR #98915 --- flang/runtime/time-intrinsic.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flang/runtime/time-intrinsic.cpp b/flang/runtime/time-intrinsic.cpp index 7352dafc9136e..92b937bc6f626 100644 --- a/flang/runtime/time-intrinsic.cpp +++ b/flang/runtime/time-intrinsic.cpp @@ -139,6 +139,7 @@ count_t ConvertTimeSpecToCount(int kind, const struct timespec &tspec) { } } +#ifndef _AIX // This is the fallback implementation, which should work everywhere. template count_t GetSystemClockCount(int kind, fallback_implementation) { @@ -153,6 +154,7 @@ count_t GetSystemClockCount(int kind, fallback_implementation) { // with the requested kind at the call site. return ConvertTimeSpecToCount(kind, tspec); } +#endif template count_t GetSystemClockCountRate(int kind, fallback_implementation) { From 1813ffd6b2eb04ee2c296a4399a18748740a439d Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Wed, 17 Jul 2024 20:14:12 +0800 Subject: [PATCH 267/777] [SLP][REVEC] Make SLP support revectorization (-slp-revec) and add simple test. (#98269) This PR will make SLP support revectorization. Add an option -slp-revec to control the functionality. reference: https://discourse.llvm.org/t/rfc-make-slp-vectorizer-revectorize-vector-instructions/79436 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 32 ++++++++++++--- llvm/test/Transforms/SLPVectorizer/revec.ll | 40 +++++++++++++++++++ 2 files changed, 66 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/revec.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 722590a840a54..ccb6734d5618c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -113,6 +113,10 @@ static cl::opt RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes")); +static cl::opt + SLPReVec("slp-revec", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization for wider vector utilization")); + static cl::opt SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -227,13 +231,26 @@ static const unsigned MaxPHINumOperands = 128; /// avoids spending time checking the cost model and realizing that they will /// be inevitably scalarized. static bool isValidElementType(Type *Ty) { + // TODO: Support ScalableVectorType. + if (SLPReVec && isa(Ty)) + Ty = Ty->getScalarType(); return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty(); } +/// \returns the number of elements for Ty. +static unsigned getNumElements(Type *Ty) { + assert(!isa(Ty) && + "ScalableVectorType is not supported."); + if (auto *VecTy = dyn_cast(Ty)) + return VecTy->getNumElements(); + return 1; +} + /// \returns the vector type of ScalarTy based on vectorization factor. static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { - return FixedVectorType::get(ScalarTy, VF); + return FixedVectorType::get(ScalarTy->getScalarType(), + VF * getNumElements(ScalarTy)); } /// \returns True if the value is a constant (but not globals/constant @@ -6779,7 +6796,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't handle vectors. 
- if (S.OpValue->getType()->isVectorTy() && + if (!SLPReVec && S.OpValue->getType()->isVectorTy() && !isa(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); @@ -6787,7 +6804,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } if (StoreInst *SI = dyn_cast(S.OpValue)) - if (SI->getValueOperand()->getType()->isVectorTy()) { + if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; @@ -11833,10 +11850,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Value *castToScalarTyElem(Value *V, std::optional IsSigned = std::nullopt) { auto *VecTy = cast(V->getType()); - if (VecTy->getElementType() == ScalarTy) + assert(getNumElements(ScalarTy) < getNumElements(VecTy) && + (getNumElements(VecTy) % getNumElements(ScalarTy) == 0)); + if (VecTy->getElementType() == ScalarTy->getScalarType()) return V; return Builder.CreateIntCast( - V, VectorType::get(ScalarTy, VecTy->getElementCount()), + V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()), IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL)))); } @@ -12221,7 +12240,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, return ShuffleBuilder.finalize(std::nullopt); }; Value *V = vectorizeTree(VE, PostponedPHIs); - if (VF != cast(V->getType())->getNumElements()) { + if (VF * getNumElements(VL[0]->getType()) != + cast(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { // Reshuffle to get only unique values. // If some of the scalars are duplicated in the vectorization diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll new file mode 100644 index 0000000000000..4b37b100763a9 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s + +define void @test1(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[C:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 4 + %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 8 + %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 12 + %0 = load <4 x i32>, ptr %a, align 4 + %1 = load <4 x i32>, ptr %arrayidx3, align 4 + %2 = load <4 x i32>, ptr %arrayidx7, align 4 + %3 = load <4 x i32>, ptr %arrayidx11, align 4 + %arrayidx19 = getelementptr inbounds i32, ptr %b, i64 4 + %arrayidx23 = getelementptr inbounds i32, ptr %b, i64 8 + %arrayidx27 = getelementptr inbounds i32, ptr %b, i64 12 + %4 = load <4 x i32>, ptr %b, align 4 + %5 = load <4 x i32>, ptr %arrayidx19, align 4 + %6 = load <4 x i32>, ptr %arrayidx23, align 4 + %7 = load <4 x i32>, ptr %arrayidx27, align 4 + %add.i = add <4 x i32> %4, %0 + %add.i63 = add <4 x i32> %5, %1 + %add.i64 = add <4 x i32> %6, %2 + %add.i65 = add <4 x i32> %7, %3 + %arrayidx36 = getelementptr inbounds i32, ptr %c, i64 4 + %arrayidx39 = getelementptr inbounds 
i32, ptr %c, i64 8 + %arrayidx42 = getelementptr inbounds i32, ptr %c, i64 12 + store <4 x i32> %add.i, ptr %c, align 4 + store <4 x i32> %add.i63, ptr %arrayidx36, align 4 + store <4 x i32> %add.i64, ptr %arrayidx39, align 4 + store <4 x i32> %add.i65, ptr %arrayidx42, align 4 + ret void +} From 329e7c80ac2dbc16c267390da5f1baaf1cd438b1 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Wed, 17 Jul 2024 13:14:31 +0100 Subject: [PATCH 268/777] [Clang] [C23] Implement N2653: u8 strings are char8_t[] (#97208) https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm Closes #97202 --------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 6 ++++ .../clang/Basic/DiagnosticSemaKinds.td | 5 ++- clang/lib/Frontend/InitPreprocessor.cpp | 8 +++-- clang/lib/Headers/stdatomic.h | 6 ++++ clang/lib/Sema/SemaExpr.cpp | 22 ++++++++---- clang/test/C/C23/n2653.c | 34 +++++++++++++++++++ clang/www/c_status.html | 2 +- 7 files changed, 72 insertions(+), 11 deletions(-) create mode 100644 clang/test/C/C23/n2653.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6dc45956a9afb..923f3d0a46164 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -362,6 +362,12 @@ C23 Feature Support - Added the ``FLT_NORM_MAX``, ``DBL_NORM_MAX``, and ``LDBL_NORM_MAX`` to the freestanding implementation of ```` that ships with Clang. +- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings` + `_: ``u8`` string + literals are now of type ``char8_t[N]`` in C23 and expose + ``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to + implement the corresponding macro in ````. + Non-comprehensive list of changes in this release ------------------------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 52ff4b026a60e..de3d94155a9a0 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7249,7 +7249,10 @@ def err_array_init_utf8_string_into_char : Error< def warn_cxx20_compat_utf8_string : Warning< "type of UTF-8 string literal will change from array of const char to " "array of const char8_t in C++20">, InGroup, DefaultIgnore; -def note_cxx20_compat_utf8_string_remove_u8 : Note< +def warn_c23_compat_utf8_string : Warning< + "type of UTF-8 string literal will change from array of char to " + "array of char8_t in C23">, InGroup, DefaultIgnore; +def note_cxx20_c23_compat_utf8_string_remove_u8 : Note< "remove 'u8' prefix to avoid a change of behavior; " "Clang encodes unprefixed narrow string literals as UTF-8">; def err_array_init_different_type : Error< diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index d40d78a38540b..920ddf7e59913 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1170,6 +1170,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI, DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder); DefineType("__WINT_TYPE__", TI.getWIntType(), Builder); DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder); + if (LangOpts.C23) + DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder); DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder); DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder); @@ -1349,8 +1351,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI, getLockFreeValue(TI.get##Type##Width(), TI)); 
DEFINE_LOCK_FREE_MACRO(BOOL, Bool); DEFINE_LOCK_FREE_MACRO(CHAR, Char); - if (LangOpts.Char8) - DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); // Treat char8_t like char. + // char8_t has the same representation / width as unsigned + // char in C++ and is a typedef for unsigned char in C23 + if (LangOpts.Char8 || LangOpts.C23) + DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16); DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32); DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar); diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h index 9c103d98af8c5..2027055f38796 100644 --- a/clang/lib/Headers/stdatomic.h +++ b/clang/lib/Headers/stdatomic.h @@ -35,6 +35,9 @@ extern "C" { #define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE #define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L +#define ATOMIC_CHAR8_T_LOCK_FREE __CLANG_ATOMIC_CHAR8_T_LOCK_FREE +#endif #define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE #define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE #define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE @@ -104,6 +107,9 @@ typedef _Atomic(long) atomic_long; typedef _Atomic(unsigned long) atomic_ulong; typedef _Atomic(long long) atomic_llong; typedef _Atomic(unsigned long long) atomic_ullong; +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L +typedef _Atomic(unsigned char) atomic_char8_t; +#endif typedef _Atomic(uint_least16_t) atomic_char16_t; typedef _Atomic(uint_least32_t) atomic_char32_t; typedef _Atomic(wchar_t) atomic_wchar_t; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 0698c3fbe98d2..d47db14d5dd3b 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2051,6 +2051,8 @@ Sema::ActOnStringLiteral(ArrayRef StringToks, Scope *UDLScope) { } else if (Literal.isUTF8()) { if (getLangOpts().Char8) CharTy = Context.Char8Ty; + else if (getLangOpts().C23) + CharTy = Context.UnsignedCharTy; Kind = StringLiteralKind::UTF8; } else if (Literal.isUTF16()) { CharTy = Context.Char16Ty; @@ -2062,17 +2064,23 @@ Sema::ActOnStringLiteral(ArrayRef StringToks, Scope *UDLScope) { CharTy = Context.UnsignedCharTy; } - // Warn on initializing an array of char from a u8 string literal; this - // becomes ill-formed in C++2a. - if (getLangOpts().CPlusPlus && !getLangOpts().CPlusPlus20 && - !getLangOpts().Char8 && Kind == StringLiteralKind::UTF8) { - Diag(StringTokLocs.front(), diag::warn_cxx20_compat_utf8_string); + // Warn on u8 string literals before C++20 and C23, whose type + // was an array of char before but becomes an array of char8_t. + // In C++20, it cannot be used where a pointer to char is expected. + // In C23, it might have an unexpected value if char was signed. + if (Kind == StringLiteralKind::UTF8 && + (getLangOpts().CPlusPlus + ? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8 + : !getLangOpts().C23)) { + Diag(StringTokLocs.front(), getLangOpts().CPlusPlus + ? diag::warn_cxx20_compat_utf8_string + : diag::warn_c23_compat_utf8_string); // Create removals for all 'u8' prefixes in the string literal(s). This - // ensures C++2a compatibility (but may change the program behavior when + // ensures C++20/C23 compatibility (but may change the program behavior when // built by non-Clang compilers for which the execution character set is // not always UTF-8). 
- auto RemovalDiag = PDiag(diag::note_cxx20_compat_utf8_string_remove_u8); + auto RemovalDiag = PDiag(diag::note_cxx20_c23_compat_utf8_string_remove_u8); SourceLocation RemovalDiagLoc; for (const Token &Tok : StringToks) { if (Tok.getKind() == tok::utf8_string_literal) { diff --git a/clang/test/C/C23/n2653.c b/clang/test/C/C23/n2653.c new file mode 100644 index 0000000000000..0c07c9a46eb64 --- /dev/null +++ b/clang/test/C/C23/n2653.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -ffreestanding -verify=c23 -std=c23 %s +// RUN: %clang_cc1 -ffreestanding -verify=c17 -std=c17 %s + +// c23-no-diagnostics + +#include + +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#define __is_same(a, b) (__extension__ _Generic(a, b: 1, default: 0) && __extension__ _Generic(b, a: 1, default: 0)) + +#ifndef ATOMIC_CHAR8_T_LOCK_FREE +#error missing +#endif +// c17-error@-2 {{missing}} + +_Static_assert(__is_same(atomic_char8_t, unsigned char _Atomic), ""); +// c17-error@-1 {{use of undeclared identifier 'atomic_char8_t'}} +// c17-error@-2 {{unknown type name 'atomic_char8_t'}} + +_Static_assert(_Generic(u8"", unsigned char*: 1, char*: 0), ""); +// c17-error@-1 {{static assertion failed}} + +// -fsigned-char is the default +#define M(X) __enable_constant_folding((X) >= 0x80) + +_Static_assert(M(u8"\U000000E9"[0]), ""); +// c17-error@-1 {{static assertion failed}} +#if __STDC_VERSION__ >= 202311L +_Static_assert(M(u8'\xC3'), ""); +#endif + +const char cu8[] = u8"text"; +const signed char scu8[] = u8"text"; +const unsigned char ucu8[] = u8"text"; diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 669448635837e..3ea70b0163c70 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -1066,7 +1066,7 @@

    C23 implementation status

    char8_t: A type for UTF-8 characters and strings N2653 - No + Clang 19 Clarification for max exponent macros-update From 177ce1900f0de05337f744edd3f4e454f7a93b06 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 17 Jul 2024 14:24:24 +0200 Subject: [PATCH 269/777] [LLVM] Add `llvm.experimental.vector.compress` intrinsic (#92289) This PR adds a new vector intrinsic `@llvm.experimental.vector.compress` to "compress" data within a vector based on a selection mask, i.e., it moves all selected values (i.e., where `mask[i] == 1`) to consecutive lanes in the result vector. A `passthru` vector can be provided, from which remaining lanes are filled. The main reason for this is that the existing `@llvm.masked.compressstore` has very strong constraints in that it can only write values that were selected, resulting in guard branches for all targets except AVX-512 (and even there the AMD implementation is _very_ slow). More instruction sets support "compress" logic, but only within registers. So to store the values, an additional store is needed. But this combination is likely significantly faster on many target as it avoids branches. In follow up PRs, my plan is to add target-specific lowerings for x86, SVE, and possibly RISCV. I also want to combine this with a store instruction, as this is probably a common case and we can avoid some memory writes in that case. See [discussion in forum](https://discourse.llvm.org/t/new-intrinsic-for-masked-vector-compress-without-store/78663) for initial discussion on the design. --- llvm/docs/GlobalISel/GenericOpcode.rst | 7 + llvm/docs/LangRef.rst | 87 ++++ llvm/docs/ReleaseNotes.rst | 1 + .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 1 + llvm/include/llvm/CodeGen/ISDOpcodes.h | 8 + llvm/include/llvm/CodeGen/TargetLowering.h | 4 + llvm/include/llvm/IR/Intrinsics.td | 5 + llvm/include/llvm/Support/TargetOpcodes.def | 3 + llvm/include/llvm/Target/GenericOpcodes.td | 7 + .../Target/GlobalISel/SelectionDAGCompat.td | 1 + .../include/llvm/Target/TargetSelectionDAG.td | 8 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 89 ++++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 53 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 23 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 + .../SelectionDAG/LegalizeVectorOps.cpp | 4 + .../SelectionDAG/LegalizeVectorTypes.cpp | 34 ++ .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 16 + .../SelectionDAG/SelectionDAGBuilder.cpp | 7 + .../SelectionDAG/SelectionDAGDumper.cpp | 1 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 102 ++++ llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 + .../AArch64/GISel/AArch64LegalizerInfo.cpp | 3 + .../GlobalISel/legalize-vector-compress.mir | 156 ++++++ .../GlobalISel/legalizer-info-validation.mir | 3 + llvm/test/CodeGen/AArch64/vector-compress.ll | 474 ++++++++++++++++++ 27 files changed, 1105 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir create mode 100644 llvm/test/CodeGen/AArch64/vector-compress.ll diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index b05394aeee003..18a53a4815722 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -726,6 +726,13 @@ The type of the operand must be equal to or larger than the vector element type. If the operand is larger than the vector element type, the scalar is implicitly truncated to the vector element type. 
+G_VECTOR_COMPRESS +^^^^^^^^^^^^^^^^^ + +Given an input vector, a mask vector, and a passthru vector, continuously place +all selected (i.e., where mask[i] = true) input lanes in an output vector. All +remaining lanes in the output are taken from passthru, which may be undef. + Vector Reduction Operations --------------------------- diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index f2ff1f0f5852c..cd86156ec816f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19525,6 +19525,93 @@ the follow sequence of operations: The ``mask`` operand will apply to at least the gather and scatter operations. + +.. _int_vector_compress: + +'``llvm.experimental.vector.compress.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +LLVM provides an intrinsic for compressing data within a vector based on a selection mask. +Semantically, this is similar to :ref:`llvm.masked.compressstore ` but with weaker assumptions +and without storing the results to memory, i.e., the data remains in the vector. + +Syntax: +""""""" +This is an overloaded intrinsic. A number of scalar values of integer, floating point or pointer data type are collected +from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector. +The remaining lanes are filled with values from ``passthru``. + +:: code-block:: llvm + + declare <8 x i32> @llvm.experimental.vector.compress.v8i32(<8 x i32> , <8 x i1> , <8 x i32> ) + declare <16 x float> @llvm.experimental.vector.compress.v16f32(<16 x float> , <16 x i1> , <16 x float> undef) + +Overview: +""""""""" + +Selects elements from input vector ``value`` according to the ``mask``. +All selected elements are written into adjacent lanes in the result vector, +from lower to higher. +The mask holds an entry for each vector lane, and is used to select elements +to be kept. +If a ``passthru`` vector is given, all remaining lanes are filled with the +corresponding lane's value from ``passthru``. +The main difference to :ref:`llvm.masked.compressstore ` is +that the we do not need to guard against memory access for unselected lanes. +This allows for branchless code and better optimization for all targets that +do not support or have inefficient +instructions of the explicit semantics of +:ref:`llvm.masked.compressstore ` but still have some form +of compress operations. +The result vector can be written with a similar effect, as all the selected +values are at the lower positions of the vector, but without requiring +branches to avoid writes where the mask is ``false``. + +Arguments: +"""""""""" + +The first operand is the input vector, from which elements are selected. +The second operand is the mask, a vector of boolean values. +The third operand is the passthru vector, from which elements are filled +into remaining lanes. +The mask and the input vector must have the same number of vector elements. +The input and passthru vectors must have the same type. + +Semantics: +"""""""""" + +The ``llvm.experimental.vector.compress`` intrinsic compresses data within a vector. +It collects elements from possibly non-adjacent lanes of a vector and places +them contiguously in the result vector based on a selection mask, filling the +remaining lanes with values from ``passthru``. +This intrinsic performs the logic of the following C++ example. +All values in ``out`` after the last selected one are undefined if +``passthru`` is undefined. +If all entries in the ``mask`` are 0, the ``out`` vector is ``passthru``. 
+If any element of the mask is poison, all elements of the result are poison. +Otherwise, if any element of the mask is undef, all elements of the result are undef. +If ``passthru`` is undefined, the number of valid lanes is equal to the number +of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values +are undefined. + +.. code-block:: cpp + + // Consecutively place selected values in a vector. + using VecT __attribute__((vector_size(N))) = int; + VecT compress(VecT vec, VecT mask, VecT passthru) { + VecT out; + int idx = 0; + for (int i = 0; i < N / sizeof(int); ++i) { + out[idx] = vec[i]; + idx += static_cast(mask[i]); + } + for (; idx < N / sizeof(int); ++idx) { + out[idx] = passthru[idx]; + } + return out; + } + + Matrix Intrinsics ----------------- diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index fa60049f67828..827a0fd3606ec 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -79,6 +79,7 @@ Changes to the LLVM IR * ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been removed. The next argument has been changed from byte index to bit index. +* Added ``llvm.experimental.vector.compress`` intrinsic. Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 6e2ab8ce40338..b17bc9aa2a44e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -412,6 +412,7 @@ class LegalizerHelper { LegalizeResult lowerUnmergeValues(MachineInstr &MI); LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); + LegalizeResult lowerVECTOR_COMPRESS(MachineInstr &MI); Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, Align Alignment, LLT PtrTy); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index e6b10209b4767..daceaf98583bd 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -659,6 +659,14 @@ enum NodeType { /// non-constant operands. STEP_VECTOR, + /// VECTOR_COMPRESS(Vec, Mask, Passthru) + /// consecutively place vector elements based on mask + /// e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0} + /// --> {A, C, ?, ?} where ? is undefined + /// If passthru is defined, ?s are replaced with elements from passthru. + /// If passthru is undef, ?s remain undefined. + VECTOR_COMPRESS, + /// MULHU/MULHS - Multiply high - Multiply two integers of type iN, /// producing an unsigned/signed value of type i[2*N], then return the top /// part. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index ef66b82d6f414..d4a2166bf768e 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5496,6 +5496,10 @@ class TargetLowering : public TargetLoweringBase { /// method accepts vectors as its arguments. SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const; + /// Expand a vector VECTOR_COMPRESS into a sequence of extract element, store + /// temporarily, advance store position, before re-loading the final vector. + SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. 
A VP_SETCC will additionally be given a Mask /// and/or EVL not equal to SDValue(). diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index fc39122aa1be0..b4e758136b39f 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2398,6 +2398,11 @@ def int_masked_compressstore: [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, NoCapture>]>; +def int_experimental_vector_compress: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + [IntrNoMem, IntrWillReturn]>; + // Test whether a pointer is associated with a type metadata identifier. def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem, IntrWillReturn, IntrSpeculatable]>; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index e7f40e87ed24a..a6672f87af977 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -754,6 +754,9 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR) /// Generic splatvector. HANDLE_TARGET_OPCODE(G_SPLAT_VECTOR) +/// Generic masked compress. +HANDLE_TARGET_OPCODE(G_VECTOR_COMPRESS) + /// Generic count trailing zeroes. HANDLE_TARGET_OPCODE(G_CTTZ) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index e1710ff2d8abf..7501048dfdd78 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1548,6 +1548,13 @@ def G_SPLAT_VECTOR: GenericInstruction { let hasSideEffects = false; } +// Generic masked compress. +def G_VECTOR_COMPRESS: GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$vec, type1:$mask, type0:$passthru); + let hasSideEffects = false; +} + //------------------------------------------------------------------------------ // Vector reductions //------------------------------------------------------------------------------ diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index fbe551e1be911..e9dbdef9fe9e7 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -193,6 +193,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 133c9b113e51b..46044aab79a83 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -266,6 +266,12 @@ def SDTMaskedScatter : SDTypeProfile<0, 4, [ SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3> ]>; +def SDTVectorCompress : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVec<2>, SDTCisSameNumEltsAs<1, 2>, + SDTCisSameAs<1, 3> +]>; + def SDTVecShuffle : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>; @@ -757,6 +763,8 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather, def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def vector_compress : SDNode<"ISD::VECTOR_COMPRESS", SDTVectorCompress>; + // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below). 
def ld : SDNode<"ISD::LOAD" , SDTLoad, diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 97be19825fcf3..72dff12423ced 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1994,6 +1994,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_VECREDUCE_UMAX; case Intrinsic::vector_reduce_umin: return TargetOpcode::G_VECREDUCE_UMIN; + case Intrinsic::experimental_vector_compress: + return TargetOpcode::G_VECTOR_COMPRESS; case Intrinsic::lround: return TargetOpcode::G_LROUND; case Intrinsic::llround: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index b58c96a866883..bcc30390bc82e 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4034,6 +4034,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerExtractInsertVectorElt(MI); case G_SHUFFLE_VECTOR: return lowerShuffleVector(MI); + case G_VECTOR_COMPRESS: + return lowerVECTOR_COMPRESS(MI); case G_DYN_STACKALLOC: return lowerDynStackAlloc(MI); case G_STACKSAVE: @@ -7593,6 +7595,93 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) { + auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] = + MI.getFirst4RegLLTs(); + + if (VecTy.isScalableVector()) + report_fatal_error("Cannot expand masked_compress for scalable vectors."); + + Align VecAlign = getStackTemporaryAlignment(VecTy); + MachinePointerInfo PtrInfo; + Register StackPtr = + createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, + PtrInfo) + .getReg(0); + MachinePointerInfo ValPtrInfo = + MachinePointerInfo::getUnknownStack(*MI.getMF()); + + LLT IdxTy = LLT::scalar(32); + LLT ValTy = VecTy.getElementType(); + Align ValAlign = getStackTemporaryAlignment(ValTy); + + auto OutPos = MIRBuilder.buildConstant(IdxTy, 0); + + bool HasPassthru = + MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF; + + if (HasPassthru) + MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign); + + Register LastWriteVal; + std::optional PassthruSplatVal = + isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI); + + if (PassthruSplatVal.has_value()) { + LastWriteVal = + MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0); + } else if (HasPassthru) { + auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask); + Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD, + {LLT::scalar(32)}, {Popcount}); + + Register LastElmtPtr = + getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0)); + LastWriteVal = + MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign) + .getReg(0); + } + + unsigned NumElmts = VecTy.getNumElements(); + for (unsigned I = 0; I < NumElmts; ++I) { + auto Idx = MIRBuilder.buildConstant(IdxTy, I); + auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx); + Register ElmtPtr = + getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0)); + MIRBuilder.buildStore(Val, ElmtPtr, ValPtrInfo, ValAlign); + + LLT MaskITy = MaskTy.getElementType(); + auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx); + if (MaskITy.getSizeInBits() > 1) + MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI); + + MaskI = 
MIRBuilder.buildZExt(IdxTy, MaskI); + OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI); + + if (HasPassthru && I == NumElmts - 1) { + auto EndOfVector = + MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1); + auto AllLanesSelected = MIRBuilder.buildICmp( + CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector); + OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy}, + {OutPos, EndOfVector}); + ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0)); + + LastWriteVal = + MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal) + .getReg(0); + MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign); + } + } + + // TODO: Use StackPtr's FrameIndex alignment. + MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign); + + MI.eraseFromParent(); + return Legalized; +} + Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, Align Alignment, diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 302ad128f4f53..30203f9119af7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -537,6 +537,7 @@ namespace { SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); + SDValue visitVECTOR_COMPRESS(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue visitMGATHER(SDNode *N); @@ -1955,6 +1956,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MLOAD: return visitMLOAD(N); case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); + case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); @@ -10006,7 +10008,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && LHSC.getZExtValue() <= RHSC.getZExtValue(); }; - + // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2 if (N0->getFlags().hasExact()) { @@ -12041,6 +12043,55 @@ SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVECTOR_COMPRESS(SDNode *N) { + SDLoc DL(N); + SDValue Vec = N->getOperand(0); + SDValue Mask = N->getOperand(1); + SDValue Passthru = N->getOperand(2); + EVT VecVT = Vec.getValueType(); + + bool HasPassthru = !Passthru.isUndef(); + + APInt SplatVal; + if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) + return TLI.isConstTrueVal(Mask) ? Vec : Passthru; + + if (Vec.isUndef() || Mask.isUndef()) + return Passthru; + + // No need for potentially expensive compress if the mask is constant. + if (ISD::isBuildVectorOfConstantSDNodes(Mask.getNode())) { + SmallVector Ops; + EVT ScalarVT = VecVT.getVectorElementType(); + unsigned NumSelected = 0; + unsigned NumElmts = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumElmts; ++I) { + SDValue MaskI = Mask.getOperand(I); + // We treat undef mask entries as "false". + if (MaskI.isUndef()) + continue; + + if (TLI.isConstTrueVal(MaskI)) { + SDValue VecI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, + DAG.getVectorIdxConstant(I, DL)); + Ops.push_back(VecI); + NumSelected++; + } + } + for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) { + SDValue Val = + HasPassthru + ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Passthru, + DAG.getVectorIdxConstant(Rest, DL)) + : DAG.getUNDEF(ScalarVT); + Ops.push_back(Val); + } + return DAG.getBuildVector(VecVT, DL, Ops); + } + + return SDValue(); +} + SDValue DAGCombiner::visitVPGATHER(SDNode *N) { VPGatherSDNode *MGT = cast(N); SDValue Mask = MGT->getMask(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 08321c3842450..af77b0070df0a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -87,6 +87,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); break; + case ISD::VECTOR_COMPRESS: + Res = PromoteIntRes_VECTOR_COMPRESS(N); + break; case ISD::SELECT: case ISD::VSELECT: case ISD::VP_SELECT: @@ -995,6 +998,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_COMPRESS(SDNode *N) { + SDValue Vec = GetPromotedInteger(N->getOperand(0)); + SDValue Passthru = GetPromotedInteger(N->getOperand(2)); + return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), Vec.getValueType(), Vec, + N->getOperand(1), Passthru); +} + /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Change the return type of the boolean result while obeying @@ -1944,6 +1954,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; + case ISD::VECTOR_COMPRESS: + Res = PromoteIntOp_VECTOR_COMPRESS(N, OpNo); + break; case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::BF16_TO_FP: @@ -2442,6 +2455,16 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, N->getIndexType(), TruncateStore); } +SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_COMPRESS(SDNode *N, + unsigned OpNo) { + assert(OpNo == 1 && "Can only promote VECTOR_COMPRESS mask."); + SDValue Vec = N->getOperand(0); + EVT VT = Vec.getValueType(); + SDValue Passthru = N->getOperand(2); + SDValue Mask = PromoteTargetBoolean(N->getOperand(1), VT); + return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), VT, Vec, Mask, Passthru); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); if (N->getOpcode() == ISD::VP_TRUNCATE) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index a5c92ee463690..d4e61c8588901 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -340,6 +340,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); + SDValue PromoteIntRes_VECTOR_COMPRESS(SDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_FFREXP(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); @@ -412,6 +413,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); + SDValue 
PromoteIntOp_VECTOR_COMPRESS(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_FIX(SDNode *N); SDValue PromoteIntOp_ExpOp(SDNode *N); @@ -927,6 +929,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, bool SplitSETCC = false); + void SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -1019,6 +1022,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); + SDValue WidenVecRes_VECTOR_COMPRESS(SDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 307d1fc920d48..57843f0959ac2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -455,6 +455,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: case ISD::MGATHER: + case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -1123,6 +1124,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; break; + case ISD::VECTOR_COMPRESS: + Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); + return; } SDValue Unrolled = DAG.UnrollVectorOp(Node); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ed629485c0c2b..92b62ccdc2755 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1110,6 +1110,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_GATHER: SplitVecRes_Gather(cast(N), Lo, Hi, /*SplitSETCC*/ true); break; + case ISD::VECTOR_COMPRESS: + SplitVecRes_VECTOR_COMPRESS(N, Lo, Hi); + break; case ISD::SETCC: case ISD::VP_SETCC: SplitVecRes_SETCC(N, Lo, Hi); @@ -2401,6 +2404,17 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, ReplaceValueWith(SDValue(N, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // This is not "trivial", as there is a dependency between the two subvectors. + // Depending on the number of 1s in the mask, the elements from the Hi vector + // need to be moved to the Lo vector. So we just perform this as one "big" + // operation and then extract the Lo and Hi vectors from that. This gets rid + // of VECTOR_COMPRESS and all other operands can be legalized later. 
+ SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N)); +} + void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && @@ -4333,6 +4347,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: Res = WidenVecRes_VP_STRIDED_LOAD(cast(N)); break; + case ISD::VECTOR_COMPRESS: + Res = WidenVecRes_VECTOR_COMPRESS(N); + break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); break; @@ -5759,6 +5776,23 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) { + SDValue Vec = N->getOperand(0); + SDValue Mask = N->getOperand(1); + SDValue Passthru = N->getOperand(2); + EVT WideVecVT = + TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), + Mask.getValueType().getVectorElementType(), + WideVecVT.getVectorNumElements()); + + SDValue WideVec = ModifyToType(Vec, WideVecVT); + SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true); + SDValue WidePassthru = ModifyToType(Passthru, WideVecVT); + return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), WideVecVT, WideVec, + WideMask, WidePassthru); +} + SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 94349ec97693f..2cd0e209f1c07 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7556,6 +7556,22 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N1.getValueType() == VT) return N1; break; + case ISD::VECTOR_COMPRESS: { + EVT VecVT = N1.getValueType(); + [[maybe_unused]] EVT MaskVT = N2.getValueType(); + [[maybe_unused]] EVT PassthruVT = N3.getValueType(); + assert(VT == VecVT && "Vector and result type don't match."); + assert(VecVT.isVector() && MaskVT.isVector() && PassthruVT.isVector() && + "All inputs must be vectors."); + assert(VecVT == PassthruVT && "Vector and passthru types don't match."); + assert(VecVT.getVectorElementCount() == MaskVT.getVectorElementCount() && + "Vector and mask must have same number of elements."); + + if (N1.isUndef() || N2.isUndef()) + return N3; + + break; + } } // Memoize node if it doesn't produce a glue result. 
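As a quick illustration of the semantics that the getNode() checks and the DAGCombiner fold above rely on: selected lanes are packed to the front of the result and, when a passthru is supplied, the remaining lanes are filled from the passthru. The IR below is only a sketch; the value names %vec, %passthru and %res are made up for the example and do not appear in this patch.

declare <4 x i32> @llvm.experimental.vector.compress.v4i32(<4 x i32>, <4 x i1>, <4 x i32>)

; Mask <true, false, true, false> selects lanes 0 and 2 of %vec, which are
; packed into lanes 0 and 1 of the result; lanes 2 and 3 come from %passthru.
; E.g. %vec = <10, 20, 30, 40> and %passthru = <7, 7, 7, 7> give <10, 30, 7, 7>.
%res = call <4 x i32> @llvm.experimental.vector.compress.v4i32(<4 x i32> %vec, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %passthru)
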
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index d5cbb733a408d..bbd4c3521d908 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8115,6 +8115,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vector_deinterleave2: visitVectorDeinterleave(I); return; + case Intrinsic::experimental_vector_compress: + setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Flags)); + return; case Intrinsic::experimental_convergence_anchor: case Intrinsic::experimental_convergence_entry: case Intrinsic::experimental_convergence_loop: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index cc8de3a217f82..16fc52caebb75 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -434,6 +434,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::MSTORE: return "masked_store"; case ISD::MGATHER: return "masked_gather"; case ISD::MSCATTER: return "masked_scatter"; + case ISD::VECTOR_COMPRESS: return "vector_compress"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1433c8821248d..adf14bd007356 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11372,6 +11372,108 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, MachinePointerInfo::getUnknownStack(MF)); } +SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, + SelectionDAG &DAG) const { + SDLoc DL(Node); + SDValue Vec = Node->getOperand(0); + SDValue Mask = Node->getOperand(1); + SDValue Passthru = Node->getOperand(2); + + EVT VecVT = Vec.getValueType(); + EVT ScalarVT = VecVT.getScalarType(); + EVT MaskVT = Mask.getValueType(); + EVT MaskScalarVT = MaskVT.getScalarType(); + + // Needs to be handled by targets that have scalable vector types. + if (VecVT.isScalableVector()) + report_fatal_error("Cannot expand masked_compress for scalable vectors."); + + SDValue StackPtr = DAG.CreateStackTemporary( + VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); + int FI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + + MVT PositionVT = getVectorIdxTy(DAG.getDataLayout()); + SDValue Chain = DAG.getEntryNode(); + SDValue OutPos = DAG.getConstant(0, DL, PositionVT); + + bool HasPassthru = !Passthru.isUndef(); + + // If we have a passthru vector, store it on the stack, overwrite the matching + // positions and then re-write the last element that was potentially + // overwritten even though mask[i] = false. + if (HasPassthru) + Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo); + + SDValue LastWriteVal; + APInt PassthruSplatVal; + bool IsSplatPassthru = + ISD::isConstantSplatVector(Passthru.getNode(), PassthruSplatVal); + + if (IsSplatPassthru) { + // As we do not know which position we wrote to last, we cannot simply + // access that index from the passthru vector. 
So we first check if passthru + // is a splat vector, to use any element ... + LastWriteVal = DAG.getConstant(PassthruSplatVal, DL, ScalarVT); + } else if (HasPassthru) { + // ... if it is not a splat vector, we need to get the passthru value at + // position = popcount(mask) and re-load it from the stack before it is + // overwritten in the loop below. + SDValue Popcount = DAG.getNode( + ISD::TRUNCATE, DL, MaskVT.changeVectorElementType(MVT::i1), Mask); + Popcount = DAG.getNode(ISD::ZERO_EXTEND, DL, + MaskVT.changeVectorElementType(ScalarVT), Popcount); + Popcount = DAG.getNode(ISD::VECREDUCE_ADD, DL, ScalarVT, Popcount); + SDValue LastElmtPtr = + getVectorElementPointer(DAG, StackPtr, VecVT, Popcount); + LastWriteVal = DAG.getLoad( + ScalarVT, DL, Chain, LastElmtPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + Chain = LastWriteVal.getValue(1); + } + + unsigned NumElms = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumElms; I++) { + SDValue Idx = DAG.getVectorIdxConstant(I, DL); + + SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); + SDValue OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + Chain = DAG.getStore( + Chain, DL, ValI, OutPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + + // Get the mask value and add it to the current output position. This + // either increments by 1 if MaskI is true or adds 0 otherwise. + // Freeze in case we have poison/undef mask entries. + SDValue MaskI = DAG.getFreeze( + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx)); + MaskI = DAG.getFreeze(MaskI); + MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); + MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI); + OutPos = DAG.getNode(ISD::ADD, DL, PositionVT, OutPos, MaskI); + + if (HasPassthru && I == NumElms - 1) { + SDValue EndOfVector = + DAG.getConstant(VecVT.getVectorNumElements() - 1, DL, PositionVT); + SDValue AllLanesSelected = + DAG.getSetCC(DL, MVT::i1, OutPos, EndOfVector, ISD::CondCode::SETUGT); + OutPos = DAG.getNode(ISD::UMIN, DL, PositionVT, OutPos, EndOfVector); + OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + + // Re-write the last ValI if all lanes were selected. Otherwise, + // overwrite the last write it with the passthru value. + LastWriteVal = + DAG.getSelect(DL, ScalarVT, AllLanesSelected, ValI, LastWriteVal); + Chain = DAG.getStore( + Chain, DL, LastWriteVal, OutPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + } + } + + return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index bf031c00a2449..8040f1eeae810 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -758,6 +758,9 @@ void TargetLoweringBase::initActions() { // Named vector shuffles default to expand. setOperationAction(ISD::VECTOR_SPLICE, VT, Expand); + // Only some target support this vector operation. Most need to expand it. + setOperationAction(ISD::VECTOR_COMPRESS, VT, Expand); + // VP operations default to expand. #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) 
\ setOperationAction(ISD::SDOPC, VT, Expand); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index d42d5511a8242..a73c971020bd8 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1202,6 +1202,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarize(1) .lower(); + // TODO: Update this to correct handling when adding AArch64/SVE support. + getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower(); + getActionDefinitionsBuilder({G_FSHL, G_FSHR}) .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) .lower(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir new file mode 100644 index 0000000000000..cc7577473b548 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir @@ -0,0 +1,156 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s +--- +name: test_vector_compress_v4s32 +body: | + bb.0: + liveins: $q0, $d1 + + ; CHECK-LABEL: name: test_vector_compress_v4s32 + ; CHECK: liveins: $q0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1 + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C1]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[AND]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C4]](s64) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C5]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND1]](s32) + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64) + ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD1]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C4]](s64) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND2]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C6]](s64) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND3]](s32) + ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL 
[[SEXT1]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64) + ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD2]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64) + ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16) + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[AND4]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C7]](s64) + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C5]] + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND5]](s32) + ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64) + ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD3]](p0) :: (store (s32)) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0) + ; CHECK-NEXT: $q0 = COPY [[LOAD]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s32>) = G_IMPLICIT_DEF + %3:_(<4 x s32>) = G_VECTOR_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit $q0 +... +--- +name: test_vector_compress_v4s32_with_passthru +body: | + bb.0: + liveins: $q0, $d1, $q2 + + + ; CHECK-LABEL: name: test_vector_compress_v4s32_with_passthru + ; CHECK: liveins: $q0, $d1, $q2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $q2 + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: G_STORE [[COPY2]](<4 x s32>), [[FRAME_INDEX]](p0) :: (store (<4 x s32>) into %stack.0) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<4 x s32>) = G_ZEXT [[COPY1]](<4 x s16>) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[ZEXT]](<4 x s32>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[VECREDUCE_ADD]], [[C1]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32)) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C3]](s64) + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[C3]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD1]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C3]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C4]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[AND1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x 
s32>), [[C5]](s64) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]] + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND2]](s32) + ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL [[SEXT1]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64) + ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD2]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C5]](s64) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C4]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND3]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C6]](s64) + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]] + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND4]](s32) + ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64) + ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD3]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64) + ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16) + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C4]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[AND5]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C7]](s64) + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C1]] + ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(s64) = G_SEXT [[AND6]](s32) + ; CHECK-NEXT: [[MUL4:%[0-9]+]]:_(s64) = G_MUL [[SEXT3]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL4]](s64) + ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD4]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C7]](s64) + ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s16) + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C4]] + ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[AND7]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[ADD3]](s32), [[C1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ADD3]], [[C1]] + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C1]] + ; CHECK-NEXT: [[SEXT4:%[0-9]+]]:_(s64) = G_SEXT [[AND8]](s32) + ; CHECK-NEXT: [[MUL5:%[0-9]+]]:_(s64) = G_MUL [[SEXT4]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL5]](s64) + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[EVEC6]], [[LOAD]] + ; CHECK-NEXT: G_STORE [[SELECT1]](s32), [[PTR_ADD5]](p0) :: (store (s32)) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0) + ; CHECK-NEXT: $q0 = COPY [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s32>) = COPY $q2 + %3:_(<4 x s32>) = G_VECTOR_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit $q0 +... 
+ + + diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 6db0b9326ca47..61ea3fb998374 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -646,6 +646,9 @@ # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECTOR_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll new file mode 100644 index 0000000000000..fcf5c546f2610 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -0,0 +1,474 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s + +define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: mov.s w9, v1[1] +; CHECK-NEXT: mov.s w10, v1[2] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: bfi x8, x11, #2, #1 +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: add x9, x11, x9 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: st1.s { v0 }[1], [x8] +; CHECK-NEXT: add w10, w9, w10 +; CHECK-NEXT: orr x9, x11, x9, lsl #2 +; CHECK-NEXT: bfi x11, x10, #2, #2 +; CHECK-NEXT: st1.s { v0 }[2], [x9] +; CHECK-NEXT: st1.s { v0 }[3], [x11] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %out +} + + +define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) { +; CHECK-LABEL: test_compress_v4i32_with_passthru: +; CHECK: ; %bb.0: +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: and.16b v3, v1, v3 +; CHECK-NEXT: str q2, [sp, #-16]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov.s w8, v1[1] +; CHECK-NEXT: fmov w16, s1 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: mov.s w11, v1[2] +; CHECK-NEXT: addv.4s s2, v3 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mov.s w13, v1[3] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: bfi x12, x16, #2, #1 +; CHECK-NEXT: and x16, x16, #0x1 +; CHECK-NEXT: mov w15, #3 ; =0x3 +; CHECK-NEXT: and x8, x8, #0x1 +; CHECK-NEXT: add x8, x16, x8 +; CHECK-NEXT: fmov w16, s2 +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: add x11, x8, x11 +; CHECK-NEXT: orr x8, x9, x8, lsl #2 +; CHECK-NEXT: add x13, x11, x13 +; CHECK-NEXT: bfi x14, x11, #2, #2 +; CHECK-NEXT: bfi x10, x16, #2, #2 +; CHECK-NEXT: mov.s w16, v0[3] +; CHECK-NEXT: cmp x13, #3 +; CHECK-NEXT: csel x11, x13, x15, lo +; CHECK-NEXT: ldr w10, [x10] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: st1.s { v0 }[1], [x12] +; CHECK-NEXT: st1.s { v0 }[2], [x8] +; CHECK-NEXT: orr x8, x9, x11, lsl #2 +; CHECK-NEXT: csel w9, w16, w10, hi +; CHECK-NEXT: st1.s { v0 }[3], [x14] +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) + ret <4 x i32> %out +} + +define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) { +; CHECK-LABEL: test_compress_v2f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ushll.2d v1, v1, #0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str d0, [sp] +; CHECK-NEXT: shl.2d v1, v1, #63 +; CHECK-NEXT: cmlt.2d v1, v1, #0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: bfi x8, x9, #3, #1 +; CHECK-NEXT: st1.d { v0 }[1], [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <2 x double> @llvm.experimental.vector.compress.v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> undef) + ret <2 x double> %out +} + +define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { +; CHECK-LABEL: test_compress_v16i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: shl.16b v1, v1, #7 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: st1.b { v0 }[0], [x8] +; CHECK-NEXT: mov x13, sp +; CHECK-NEXT: cmlt.16b v1, v1, #0 +; CHECK-NEXT: umov.b w9, v1[0] +; CHECK-NEXT: umov.b w10, v1[1] +; CHECK-NEXT: umov.b w11, v1[2] +; CHECK-NEXT: umov.b w14, v1[3] +; CHECK-NEXT: bfxil x12, x9, #0, #1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: umov.b w10, v1[4] +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: st1.b { v0 }[1], [x12] +; CHECK-NEXT: orr x12, x8, x9 +; CHECK-NEXT: add x9, x9, x11 +; CHECK-NEXT: umov.b w11, v1[5] +; CHECK-NEXT: and x14, x14, #0x1 +; CHECK-NEXT: st1.b { v0 }[2], [x12] +; CHECK-NEXT: add x14, x9, x14 +; CHECK-NEXT: umov.b w12, v1[6] +; CHECK-NEXT: orr x9, x8, x9 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: st1.b { v0 }[3], [x9] +; CHECK-NEXT: orr x9, x8, x14 +; CHECK-NEXT: add x10, x14, x10 +; CHECK-NEXT: umov.b w14, v1[7] +; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: bfxil x13, x10, #0, #4 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: umov.b w11, v1[8] +; CHECK-NEXT: and x12, x12, #0x1 +; CHECK-NEXT: bfxil x9, x10, #0, #4 +; CHECK-NEXT: st1.b { v0 }[5], [x13] +; CHECK-NEXT: umov.b w13, v1[9] +; CHECK-NEXT: add x10, x10, x12 +; CHECK-NEXT: mov x12, sp +; 
CHECK-NEXT: and x14, x14, #0x1 +; CHECK-NEXT: st1.b { v0 }[6], [x9] +; CHECK-NEXT: umov.b w9, v1[10] +; CHECK-NEXT: bfxil x12, x10, #0, #4 +; CHECK-NEXT: add x10, x10, x14 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: bfxil x14, x10, #0, #4 +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: st1.b { v0 }[7], [x12] +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: bfxil x11, x10, #0, #4 +; CHECK-NEXT: add x10, x10, x13 +; CHECK-NEXT: umov.b w13, v1[11] +; CHECK-NEXT: st1.b { v0 }[8], [x14] +; CHECK-NEXT: umov.b w14, v1[12] +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: bfxil x12, x10, #0, #4 +; CHECK-NEXT: add x9, x10, x9 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: st1.b { v0 }[9], [x11] +; CHECK-NEXT: umov.b w11, v1[13] +; CHECK-NEXT: bfxil x10, x9, #0, #4 +; CHECK-NEXT: st1.b { v0 }[10], [x12] +; CHECK-NEXT: umov.b w12, v1[14] +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: and x14, x14, #0x1 +; CHECK-NEXT: add x9, x9, x13 +; CHECK-NEXT: st1.b { v0 }[11], [x10] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: add x13, x9, x14 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: bfxil x10, x9, #0, #4 +; CHECK-NEXT: and x9, x11, #0x1 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: add x9, x13, x9 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: bfxil x14, x13, #0, #4 +; CHECK-NEXT: bfxil x11, x9, #0, #4 +; CHECK-NEXT: add w9, w9, w12 +; CHECK-NEXT: st1.b { v0 }[12], [x10] +; CHECK-NEXT: bfxil x8, x9, #0, #4 +; CHECK-NEXT: st1.b { v0 }[13], [x14] +; CHECK-NEXT: st1.b { v0 }[14], [x11] +; CHECK-NEXT: st1.b { v0 }[15], [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) + ret <16 x i8> %out +} + +define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { +; CHECK-LABEL: test_compress_large: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umov.b w9, v2[0] +; CHECK-NEXT: umov.b w10, v2[1] +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: umov.b w11, v2[2] +; CHECK-NEXT: umov.b w13, v2[3] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: umov.b w14, v2[4] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x15, x9, #0x1 +; CHECK-NEXT: bfi x12, x9, #2, #1 +; CHECK-NEXT: and x9, x11, #0x1 +; CHECK-NEXT: add x10, x15, x10 +; CHECK-NEXT: umov.b w11, v2[5] +; CHECK-NEXT: add x9, x10, x9 +; CHECK-NEXT: orr x15, x8, x10, lsl #2 +; CHECK-NEXT: umov.b w10, v2[6] +; CHECK-NEXT: st1.s { v0 }[1], [x12] +; CHECK-NEXT: add x12, x8, x9, lsl #2 +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: st1.s { v0 }[2], [x15] +; CHECK-NEXT: add x9, x9, x13 +; CHECK-NEXT: st1.s { v0 }[3], [x12] +; CHECK-NEXT: and x12, x14, #0x1 +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: add x12, x9, x12 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and x9, x9, #0x7 +; CHECK-NEXT: add x11, x12, x11 +; CHECK-NEXT: and x12, x12, #0x7 +; CHECK-NEXT: str s1, [x8, x9, lsl #2] +; CHECK-NEXT: add w10, w11, w10 +; CHECK-NEXT: and x11, x11, #0x7 +; CHECK-NEXT: add x12, x8, x12, lsl #2 +; CHECK-NEXT: and x10, x10, #0x7 +; CHECK-NEXT: add x9, x8, x11, lsl #2 +; CHECK-NEXT: add x8, x8, x10, lsl #2 +; CHECK-NEXT: st1.s { v1 }[1], [x12] +; CHECK-NEXT: st1.s { v1 }[2], [x9] +; CHECK-NEXT: st1.s { v1 }[3], [x8] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret + %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, 
<8 x i32> undef) + ret <8 x i32> %out +} + +define <4 x i32> @test_compress_all_const() { +; CHECK-LABEL: test_compress_all_const: +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr q0, [x8, lCPI5_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> , + <4 x i1> , + <4 x i32> undef) + ret <4 x i32> %out +} + +define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.s v0[1], v0[3] +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %out +} + +define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) { +; CHECK-LABEL: test_compress_const_mask_passthrough: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.d v1[0], v0[1] +; CHECK-NEXT: mov.s v1[0], v0[0] +; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> %passthru) + ret <4 x i32> %out +} + +define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_mask_const_passthrough: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.s v0[1], v0[3] +; CHECK-NEXT: mov w8, #7 ; =0x7 +; CHECK-NEXT: mov.s v0[2], w8 +; CHECK-NEXT: mov w8, #8 ; =0x8 +; CHECK-NEXT: mov.s v0[3], w8 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) + ret <4 x i32> %out +} + +; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying +; the second vector input register to the return register or doing nothing. 
+define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_splat1_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_splat0_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_undef_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) { +; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru: +; CHECK: ; %bb.0: +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) + ret <4 x i32> %out +} + +define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_small: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: shl.4h v1, v1, #15 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str h0, [sp, #8] +; CHECK-NEXT: cmlt.4h v1, v1, #0 +; CHECK-NEXT: umov.h w9, v1[0] +; CHECK-NEXT: umov.h w10, v1[1] +; CHECK-NEXT: umov.h w11, v1[2] +; CHECK-NEXT: bfi x8, x9, #1, #1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: add w11, w9, w11 +; CHECK-NEXT: orr x9, x10, x9, lsl #1 +; CHECK-NEXT: st1.h { v0 }[1], [x8] +; CHECK-NEXT: bfi x10, x11, #1, #2 +; CHECK-NEXT: st1.h { v0 }[2], [x9] +; CHECK-NEXT: st1.h { v0 }[3], [x10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef) + ret <4 x i8> %out +} + +define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_illegal_element_type: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: shl.4h v1, v1, #15 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str h0, [sp, #8] +; CHECK-NEXT: cmlt.4h v1, v1, #0 +; CHECK-NEXT: umov.h w9, v1[0] +; CHECK-NEXT: umov.h w10, v1[1] +; CHECK-NEXT: umov.h w11, v1[2] +; CHECK-NEXT: bfi x8, x9, #1, #1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: add w11, w9, w11 +; CHECK-NEXT: orr x9, 
x10, x9, lsl #1 +; CHECK-NEXT: st1.h { v0 }[1], [x8] +; CHECK-NEXT: bfi x10, x11, #1, #2 +; CHECK-NEXT: st1.h { v0 }[2], [x9] +; CHECK-NEXT: st1.h { v0 }[3], [x10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef) + ret <4 x i4> %out +} + +define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { +; CHECK-LABEL: test_compress_narrow: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: mov.h v1[0], w0 +; CHECK-NEXT: mov.h v1[1], w1 +; CHECK-NEXT: mov.h v1[2], w2 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: mov.s w8, v1[1] +; CHECK-NEXT: mov.s w9, v1[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: bfi x11, x10, #2, #1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x8, x8, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add x8, x10, x8 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: st1.s { v0 }[1], [x11] +; CHECK-NEXT: add w9, w8, w9 +; CHECK-NEXT: orr x8, x10, x8, lsl #2 +; CHECK-NEXT: bfi x10, x9, #2, #2 +; CHECK-NEXT: st1.s { v0 }[2], [x8] +; CHECK-NEXT: st1.s { v0 }[3], [x10] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef) + ret <3 x i32> %out +} + +define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) { +; CHECK-LABEL: test_compress_narrow_illegal_element_type: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movi.2d v0, #0000000000000000 +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: strh w0, [sp, #8] +; CHECK-NEXT: mov.h v0[0], w3 +; CHECK-NEXT: mov.h v0[1], w4 +; CHECK-NEXT: mov.h v0[2], w5 +; CHECK-NEXT: shl.4h v0, v0, #15 +; CHECK-NEXT: cmlt.4h v0, v0, #0 +; CHECK-NEXT: umov.h w8, v0[0] +; CHECK-NEXT: umov.h w9, v0[1] +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: and x11, x8, #0x1 +; CHECK-NEXT: bfi x10, x8, #1, #1 +; CHECK-NEXT: add x8, x11, x9 +; CHECK-NEXT: add x9, sp, #8 +; CHECK-NEXT: orr x8, x9, x8, lsl #1 +; CHECK-NEXT: strh w1, [x10] +; CHECK-NEXT: strh w2, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: umov.h w0, v0[0] +; CHECK-NEXT: umov.h w1, v0[1] +; CHECK-NEXT: umov.h w2, v0[2] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef) + ret <3 x i3> %out +} From b05ccaf451bca11fda5437003d54d7975cd8e575 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 05:51:29 -0700 Subject: [PATCH 270/777] Revert "[SLP]Improve minbitwidth analysis for trun'ed gather nodes." This reverts commit 6425f2d66740b84fc3027b649cd4baf660c384e8 to fix the buildbost issues reported in https://lab.llvm.org/buildbot/#/builders/95/builds/1404. 
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 15 +------ .../X86/int-bitcast-minbitwidth.ll | 6 +-- .../X86/minbitwidth-transformed-operand.ll | 22 +++++---- .../Transforms/SLPVectorizer/X86/resched.ll | 45 +++++++++---------- .../SLPVectorizer/X86/shuffle-multivector.ll | 13 +++--- .../orig-btiwidth-les-projected.ll | 8 ++-- 6 files changed, 47 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ccb6734d5618c..7bdbbecb7f0d8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15522,21 +15522,8 @@ void BoUpSLP::computeMinimumValueSizes() { auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, unsigned Opcode, unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) -> unsigned { + bool IsSignedCmp) { ToDemote.clear(); - // Check if the root is trunc and the next node is gather/buildvector, then - // keep trunc in scalars, which is free in most cases. - if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && - E.Idx > (IsStoreOrInsertElt ? 2 : 1)) { - ToDemote.push_back(E.Idx); - const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; - auto It = MinBWs.find(UserTE); - if (It != MinBWs.end()) - return It->second.first; - return DL->getTypeSizeInBits( - E.UserTreeIndices.back().UserTE->Scalars.front()->getType()); - } - unsigned VF = E.getVectorFactor(); auto *TreeRootIT = dyn_cast(E.Scalars.front()->getType()); if (!TreeRootIT || !Opcode) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index 97e505f4319c6..789d73947d1c7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -5,9 +5,9 @@ define void @t(i64 %v) { ; CHECK-LABEL: define void @t( ; CHECK-SAME: i64 [[V:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[V]] to i16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 57b5d2af48ee6..032625a1199f9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -5,16 +5,20 @@ define void @test(i64 %d.promoted.i) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i1> -; CHECK-NEXT: 
[[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> , <16 x i1> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP6]]) +; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index 4ed52247c2ef3..b7237cbb02bb3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,31 +11,26 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 -; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 -; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB_I]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], -; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i8> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <16 x i32> -; 
CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP18]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i8> [[TMP19]], -; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr undef, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], +; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 +; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 +; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], +; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll index c2555889f5981..143052a3d9cd0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-160 | FileCheck %s define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( @@ -14,14 +14,13 @@ define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[T5:%.*]] = trunc i128 [[P1]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i128> [[VEC:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i128> [[VEC]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i128> [[TMP1]], <2 x i128> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i128> [[VEC:%.*]], <4 x i128> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x 
i128> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP11]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP13]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP12]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll index 88503aeb6071f..531e964053482 100644 --- a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll +++ b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll @@ -5,10 +5,10 @@ define i32 @test(i4 %0) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: i4 [[TMP0:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i4> , i4 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i4> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[ADD_R:%.*]] = extractelement <2 x i4> [[TMP2]], i32 0 -; CHECK-NEXT: [[ADD_R14:%.*]] = extractelement <2 x i4> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i8 0 to i4 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i8 0 to i4 +; CHECK-NEXT: [[ADD_R:%.*]] = or i4 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ADD_R14:%.*]] = or i4 0, [[TMP2]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i4 [[ADD_R]], [[ADD_R14]] ; CHECK-NEXT: ret i32 0 ; From e093109e4a1551de13a1219275d62e9c7ee3146f Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 17 Jul 2024 13:56:44 +0100 Subject: [PATCH 271/777] [InstrRef][NFC] Avoid another DenseMap, use a sorted vector (#99051) When resolving value-numbers to specific machine locations in the final stages of LiveDebugValues, we've been producing a DenseMap containing all the value-numbers we're interested in. However we never modify the map keys as they're all pre-known. Thus, this is a suitable collection to switch to a sorted vector that gets searched, rather than a DenseMap that gets probed. The overall operation of LiveDebugValues isn't affected at all. --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index bde8cc4a89715..247258a1ff553 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -316,6 +316,13 @@ class TransferTracker { bool isBest() const { return getQuality() == LocationQuality::Best; } }; + using ValueLocPair = std::pair; + + static inline bool ValueToLocSort(const ValueLocPair &A, + const ValueLocPair &B) { + return A.first < B.first; + }; + // Returns the LocationQuality for the location L iff the quality of L is // is strictly greater than the provided minimum quality. std::optional @@ -344,7 +351,7 @@ class TransferTracker { /// \p DbgOpStore is the map containing the DbgOpID->DbgOp mapping needed to /// determine the values used by Value. void loadVarInloc(MachineBasicBlock &MBB, DbgOpIDMap &DbgOpStore, - const DenseMap &ValueToLoc, + const SmallVectorImpl &ValueToLoc, DebugVariable Var, DbgValue Value) { SmallVector DbgOps; SmallVector ResolvedDbgOps; @@ -373,9 +380,17 @@ class TransferTracker { continue; } - // If the value has no location, we can't make a variable location. + // Search for the desired ValueIDNum, to examine the best location found + // for it. Use an empty ValueLocPair to search for an entry in ValueToLoc. 
const ValueIDNum &Num = Op.ID; - auto ValuesPreferredLoc = ValueToLoc.find(Num); + ValueLocPair Probe(Num, LocationAndQuality()); + auto ValuesPreferredLoc = std::lower_bound( + ValueToLoc.begin(), ValueToLoc.end(), Probe, ValueToLocSort); + + // There must be a legitimate entry found for Num. + assert(ValuesPreferredLoc != ValueToLoc.end() && + ValuesPreferredLoc->first == Num); + if (ValuesPreferredLoc->second.isIllegal()) { // If it's a def that occurs in this block, register it as a // use-before-def to be resolved as we step through the block. @@ -439,8 +454,9 @@ class TransferTracker { UseBeforeDefs.clear(); UseBeforeDefVariables.clear(); - // Map of the preferred location for each value. - DenseMap ValueToLoc; + // Mapping of the preferred locations for each value. Collected into this + // vector then sorted for easy searching. + SmallVector ValueToLoc; // Initialized the preferred-location map with illegal locations, to be // filled in later. @@ -448,8 +464,10 @@ class TransferTracker { if (VLoc.second.Kind == DbgValue::Def) for (DbgOpID OpID : VLoc.second.getDbgOpIDs()) if (!OpID.ID.IsConst) - ValueToLoc.insert({DbgOpStore.find(OpID).ID, LocationAndQuality()}); + ValueToLoc.push_back( + {DbgOpStore.find(OpID).ID, LocationAndQuality()}); + llvm::sort(ValueToLoc, ValueToLocSort); ActiveMLocs.reserve(VLocs.size()); ActiveVLocs.reserve(VLocs.size()); @@ -464,8 +482,10 @@ class TransferTracker { VarLocs.push_back(VNum); // Is there a variable that wants a location for this value? If not, skip. - auto VIt = ValueToLoc.find(VNum); - if (VIt == ValueToLoc.end()) + ValueLocPair Probe(VNum, LocationAndQuality()); + auto VIt = std::lower_bound(ValueToLoc.begin(), ValueToLoc.end(), Probe, + ValueToLocSort); + if (VIt == ValueToLoc.end() || VIt->first != VNum) continue; auto &Previous = VIt->second; From 8917d52938b907b00fced24506708b381f472890 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 16 Jul 2024 19:01:05 +0100 Subject: [PATCH 272/777] [X86] createSetFPEnvNodes - pass SDLoc by reference instead of value. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 64303130922bd..881e06e5f78b4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27837,7 +27837,7 @@ SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op, return Chain; } -static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL, +static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget) { From 396a5ba51e9e47f818a749ec3f2368e4fea6a67f Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Wed, 17 Jul 2024 14:08:51 +0100 Subject: [PATCH 273/777] [Clang] Add attribute for consteval builtin functions (#91894) Builtins with the new `Consteval` attribute will also be marked `Constexpr` and will only be available in C++20 mode where `consteval` makes sense. 
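As a rough sketch of the intended semantics (the builtin name below is purely illustrative and is not one added by this patch): a builtin carrying the `Consteval` attribute behaves like a user-written `consteval` function, so every call site is an immediate invocation that must constant-evaluate:

    // Hypothetical builtin registered with the new Consteval ("EG") attribute;
    // the name is illustrative only.
    constexpr int Ok = __builtin_example(1);   // accepted: constant-evaluated

    int runtime(int n) {
      return __builtin_example(n);             // rejected: the immediate
                                               // invocation is not a constant
                                               // expression
    }
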
--- clang/include/clang/Basic/Builtins.def | 1 + clang/include/clang/Basic/Builtins.h | 5 +++++ clang/include/clang/Basic/BuiltinsBase.td | 2 ++ clang/lib/Basic/Builtins.cpp | 3 +++ clang/lib/Sema/SemaDecl.cpp | 15 +++++++++++---- clang/lib/Sema/SemaExpr.cpp | 8 ++++++-- 6 files changed, 28 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index d2d500c990b99..48437c9397570 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -101,3 +101,4 @@ // M_0, ..., M_k as payload // z -> this is a function in (possibly-versioned) namespace std // E -> this function can be constant evaluated by Clang frontend +// G -> this is a C++20 consteval function diff --git a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h index f955d21169556..e85ec5b2dca14 100644 --- a/clang/include/clang/Basic/Builtins.h +++ b/clang/include/clang/Basic/Builtins.h @@ -280,6 +280,11 @@ class Context { return strchr(getRecord(ID).Attributes, 'E') != nullptr; } + /// Returns true if this is an immediate (consteval) function + bool isImmediate(unsigned ID) const { + return strchr(getRecord(ID).Attributes, 'G') != nullptr; + } + private: const Info &getRecord(unsigned ID) const; diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td index 724747ec76d73..58dee22fc0a45 100644 --- a/clang/include/clang/Basic/BuiltinsBase.td +++ b/clang/include/clang/Basic/BuiltinsBase.td @@ -70,6 +70,8 @@ class VScanfFormat : IndexedAttribute<"S", I>; // Builtin can be constant evaluated def Constexpr : Attribute<"E">; +// Builtin is immediate and must be constant evaluated. Implies Constexpr, and will only be supported in C++20 mode. 
+def Consteval : Attribute<"EG">; // Builtin kinds // ============= diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp index b116abbe034f7..7116e27cd9546 100644 --- a/clang/lib/Basic/Builtins.cpp +++ b/clang/lib/Basic/Builtins.cpp @@ -119,6 +119,9 @@ static bool builtinIsSupported(const Builtin::Info &BuiltinInfo, /* CPlusPlus Unsupported */ if (!LangOpts.CPlusPlus && BuiltinInfo.Langs == CXX_LANG) return false; + /* consteval Unsupported */ + if (!LangOpts.CPlusPlus20 && strchr(BuiltinInfo.Attributes, 'G') != nullptr) + return false; return true; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 80b5a8cd4bae6..1f2fde12c9d24 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2294,10 +2294,17 @@ FunctionDecl *Sema::CreateBuiltin(IdentifierInfo *II, QualType Type, Parent = CLinkageDecl; } - FunctionDecl *New = FunctionDecl::Create(Context, Parent, Loc, Loc, II, Type, - /*TInfo=*/nullptr, SC_Extern, - getCurFPFeatures().isFPConstrained(), - false, Type->isFunctionProtoType()); + ConstexprSpecKind ConstexprKind = ConstexprSpecKind::Unspecified; + if (Context.BuiltinInfo.isImmediate(ID)) { + assert(getLangOpts().CPlusPlus20 && + "consteval builtins should only be available in C++20 mode"); + ConstexprKind = ConstexprSpecKind::Consteval; + } + + FunctionDecl *New = FunctionDecl::Create( + Context, Parent, Loc, Loc, II, Type, /*TInfo=*/nullptr, SC_Extern, + getCurFPFeatures().isFPConstrained(), /*isInlineSpecified=*/false, + Type->isFunctionProtoType(), ConstexprKind); New->setImplicit(); New->addAttr(BuiltinAttr::CreateImplicit(Context, ID)); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index d47db14d5dd3b..8d24e34520e77 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6770,8 +6770,12 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl, } // Bail out early if calling a builtin with custom type checking. - if (BuiltinID && Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) - return CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall); + if (BuiltinID && Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) { + ExprResult E = CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall); + if (!E.isInvalid() && Context.BuiltinInfo.isImmediate(BuiltinID)) + E = CheckForImmediateInvocation(E, FDecl); + return E; + } if (getLangOpts().CUDA) { if (Config) { From 6451806ef73bb033be3f6e1599f3bcb224943206 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Wed, 17 Jul 2024 14:19:23 +0100 Subject: [PATCH 274/777] [Clang] Require base element type of `__has_unique_object_representations` to be complete (#95432) Fixes #95311 Previous behaviour was that `false` was silently returned, templated classes were not instantiated and incomplete classes did not issue an error. 
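A minimal sketch of the new behaviour, mirroring the test added to SemaCXX/type-traits.cpp in this patch (class names are only illustrative):

    template <int N> class Foo { int x; };
    // Instantiates Foo<0> so the array's element type is complete, then
    // evaluates the trait on it.
    static_assert(__has_unique_object_representations(Foo<0>[]));

    class Bar;  // forward declaration only
    // Now rejected with an incomplete-type diagnostic instead of quietly
    // yielding false.
    static_assert(__has_unique_object_representations(Bar[]));
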
--------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 3 +++ clang/lib/AST/ASTContext.cpp | 4 ++++ clang/lib/Sema/SemaExprCXX.cpp | 5 ++++- clang/test/SemaCXX/type-traits.cpp | 11 +++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 923f3d0a46164..e63282ca3b40d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -835,6 +835,9 @@ Bug Fixes in This Version - Fixed Clang from generating dangling StringRefs when deserializing Exprs & Stmts (#GH98667) +- ``__has_unique_object_representations`` correctly handles arrays of unknown bounds of + types by ensuring they are complete and instantiating them if needed. Fixes (#GH95311). + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 6c89e3890ae3e..ccbb4baad68af 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2831,6 +2831,10 @@ bool ASTContext::hasUniqueObjectRepresentations( return hasUniqueObjectRepresentations(getBaseElementType(Ty), CheckIfTriviallyCopyable); + assert((Ty->isVoidType() || !Ty->isIncompleteType()) && + "hasUniqueObjectRepresentations should not be called with an " + "incomplete type"); + // (9.1) - T is trivially copyable... if (CheckIfTriviallyCopyable && !Ty.isTriviallyCopyableType(*this)) return false; diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index bef7da239e6e5..14d1f395af90e 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -5069,6 +5069,10 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, case UTT_HasTrivialCopy: case UTT_HasTrivialDestructor: case UTT_HasVirtualDestructor: + // has_unique_object_representations when T is an array is defined in terms + // of has_unique_object_representations>, so the base + // type needs to be complete even if the type is an incomplete array type. + case UTT_HasUniqueObjectRepresentations: ArgTy = QualType(ArgTy->getBaseElementTypeUnsafe(), 0); [[fallthrough]]; @@ -5077,7 +5081,6 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, case UTT_IsDestructible: case UTT_IsNothrowDestructible: case UTT_IsTriviallyDestructible: - case UTT_HasUniqueObjectRepresentations: if (ArgTy->isIncompleteArrayType() || ArgTy->isVoidType()) return true; diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 7adbf4aad7afe..23b07cac13eaf 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -3505,6 +3505,17 @@ static_assert(__has_unique_object_representations(_BitInt(8)), "BitInt:"); static_assert(!__has_unique_object_representations(_BitInt(127)), "BitInt:"); static_assert(__has_unique_object_representations(_BitInt(128)), "BitInt:"); +namespace GH95311 { + +template +class Foo { + int x; +}; +static_assert(__has_unique_object_representations(Foo<0>[])); +class Bar; // expected-note {{forward declaration of 'GH95311::Bar'}} +static_assert(__has_unique_object_representations(Bar[])); // expected-error {{incomplete type}} + +} namespace PR46209 { // Foo has both a trivial assignment operator and a non-trivial one. 
From 544c390aac41983342227db1c47be9308188712f Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 17 Jul 2024 21:20:07 +0800 Subject: [PATCH 275/777] [CodeGen] Fix -Wunused-variable in SelectionDAG.cpp (NFC) /llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp:7560:9: error: unused variable 'VecVT' [-Werror,-Wunused-variable] EVT VecVT = N1.getValueType(); ^ 1 error generated. --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 2cd0e209f1c07..02d44cd36ae53 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7557,7 +7557,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return N1; break; case ISD::VECTOR_COMPRESS: { - EVT VecVT = N1.getValueType(); + [[maybe_unused]] EVT VecVT = N1.getValueType(); [[maybe_unused]] EVT MaskVT = N2.getValueType(); [[maybe_unused]] EVT PassthruVT = N3.getValueType(); assert(VT == VecVT && "Vector and result type don't match."); From 75b3ddf23b7dfb2cf4cb3c99b4b7ee80e510589d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 17 Jul 2024 14:30:19 +0100 Subject: [PATCH 276/777] [VPlan] Use State.VF in vectorizeInterleaveGroup (NFCI). Update vectorizeInterleaveGroup to use State.VF in preparation to moving the code directly to the recipe. --- .../Transforms/Vectorize/LoopVectorize.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5fc365f77efbb..c276a2995f54c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2426,7 +2426,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); + auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); // Prepare for the new pointers. SmallVector AddrParts; @@ -2444,7 +2444,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
if (Group->isReverse()) { - Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); + Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); @@ -2481,14 +2481,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( unsigned Part, Value *MaskForGaps) -> Value * { - if (VF.isScalable()) { + if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); auto *BlockInMaskPart = State.get(BlockInMask, Part); SmallVector Ops = {BlockInMaskPart, BlockInMaskPart}; - auto *MaskTy = - VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), + State.VF.getKnownMinValue() * 2, true); return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops, /*FMFSource=*/nullptr, "interleaved.mask"); } @@ -2499,7 +2499,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *BlockInMaskPart = State.get(BlockInMask, Part); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), "interleaved.mask"); return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2511,7 +2511,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *MaskForGaps = nullptr; if (NeedsMaskForGaps) { MaskForGaps = - createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + createBitMaskForGaps(Builder, State.VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2554,7 +2554,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *StridedVec = Builder.CreateExtractValue(DI, I); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2580,15 +2580,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( continue; auto StrideMask = - createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); + createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); for (unsigned Part = 0; Part < State.UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2603,14 +2603,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - auto *SubVT = VectorType::get(ScalarTy, VF); + auto *SubVT = VectorType::get(ScalarTy, State.VF); // Vectorize the interleaved store group. 
Value *MaskForGaps = - createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + createBitMaskForGaps(Builder, State.VF.getKnownMinValue(), *Group); assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && "masked interleaved groups are not allowed."); - assert((!MaskForGaps || !VF.isScalable()) && + assert((!MaskForGaps || !State.VF.isScalable()) && "masking gaps for scalable vectors is not yet supported."); for (unsigned Part = 0; Part < State.UF; Part++) { // Collect the stored vector from each member. From 3fae5551de72756c3bb9fb2e5a29c95f02cbbd6b Mon Sep 17 00:00:00 2001 From: Jan Leyonberg Date: Wed, 17 Jul 2024 09:33:04 -0400 Subject: [PATCH 277/777] [MLIR][ROCDL] Refactor conversion of math operations to ROCDL calls to a separate pass (#98653) This patch refactors the conversion of math operations to ROCDL library calls. This pass will also be used in flang to lower Fortran intrinsics/math functions for OpenMP target offloading codgen. --- .../mlir/Conversion/MathToROCDL/MathToROCDL.h | 26 ++ mlir/include/mlir/Conversion/Passes.h | 1 + mlir/include/mlir/Conversion/Passes.td | 17 + mlir/lib/Conversion/CMakeLists.txt | 1 + mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 + .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 46 +- .../lib/Conversion/MathToROCDL/CMakeLists.txt | 23 + .../Conversion/MathToROCDL/MathToROCDL.cpp | 146 ++++++ .../Conversion/MathToROCDL/math-to-rocdl.mlir | 435 ++++++++++++++++++ 9 files changed, 652 insertions(+), 44 deletions(-) create mode 100644 mlir/include/mlir/Conversion/MathToROCDL/MathToROCDL.h create mode 100644 mlir/lib/Conversion/MathToROCDL/CMakeLists.txt create mode 100644 mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp create mode 100644 mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir diff --git a/mlir/include/mlir/Conversion/MathToROCDL/MathToROCDL.h b/mlir/include/mlir/Conversion/MathToROCDL/MathToROCDL.h new file mode 100644 index 0000000000000..fa7a635568c7c --- /dev/null +++ b/mlir/include/mlir/Conversion/MathToROCDL/MathToROCDL.h @@ -0,0 +1,26 @@ +//===- MathToROCDL.h - Utils to convert from the complex dialect --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_MATHTOROCDL_MATHTOROCDL_H_ +#define MLIR_CONVERSION_MATHTOROCDL_MATHTOROCDL_H_ + +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/IR/PatternMatch.h" +#include + +namespace mlir { +class Pass; + +#define GEN_PASS_DECL_CONVERTMATHTOROCDL +#include "mlir/Conversion/Passes.h.inc" + +/// Populate the given list with patterns that convert from Math to ROCDL calls. 
+void populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); +} // namespace mlir + +#endif // MLIR_CONVERSION_MATHTOROCDL_MATHTOROCDL_H_ diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 8c6f85d461aea..208f26489d6c3 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -46,6 +46,7 @@ #include "mlir/Conversion/MathToFuncs/MathToFuncs.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/MathToLibm/MathToLibm.h" +#include "mlir/Conversion/MathToROCDL/MathToROCDL.h" #include "mlir/Conversion/MathToSPIRV/MathToSPIRVPass.h" #include "mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 560b088dbe5cd..54b94bbfb93d1 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -733,6 +733,23 @@ def ConvertMathToLLVMPass : Pass<"convert-math-to-llvm"> { ]; } +//===----------------------------------------------------------------------===// +// MathToLibm +//===----------------------------------------------------------------------===// + +def ConvertMathToROCDL : Pass<"convert-math-to-rocdl", "ModuleOp"> { + let summary = "Convert Math dialect to ROCDL library calls"; + let description = [{ + This pass converts supported Math ops to ROCDL library calls. + }]; + let dependentDialects = [ + "arith::ArithDialect", + "func::FuncDialect", + "ROCDL::ROCDLDialect", + "vector::VectorDialect", + ]; +} + //===----------------------------------------------------------------------===// // MathToSPIRV //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index e107738a4c50c..80c8b84d9ae89 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -36,6 +36,7 @@ add_subdirectory(LLVMCommon) add_subdirectory(MathToFuncs) add_subdirectory(MathToLibm) add_subdirectory(MathToLLVM) +add_subdirectory(MathToROCDL) add_subdirectory(MathToSPIRV) add_subdirectory(MemRefToEmitC) add_subdirectory(MemRefToLLVM) diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt index 70707b5c3a049..945e3ccdfa87b 100644 --- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms MLIRArithToLLVM MLIRArithTransforms MLIRMathToLLVM + MLIRMathToROCDL MLIRAMDGPUToROCDL MLIRFuncToLLVM MLIRGPUDialect diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 40eb15a491063..100181cdc69fe 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -26,6 +26,7 @@ #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/MathToROCDL/MathToROCDL.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" @@ -386,50 +387,7 @@ void mlir::populateGpuToROCDLConversionPatterns( patterns.add(converter); - 
populateOpPatterns(converter, patterns, "__ocml_fabs_f32", - "__ocml_fabs_f64"); - populateOpPatterns(converter, patterns, "__ocml_atan_f32", - "__ocml_atan_f64"); - populateOpPatterns(converter, patterns, "__ocml_atan2_f32", - "__ocml_atan2_f64"); - populateOpPatterns(converter, patterns, "__ocml_cbrt_f32", - "__ocml_cbrt_f64"); - populateOpPatterns(converter, patterns, "__ocml_ceil_f32", - "__ocml_ceil_f64"); - populateOpPatterns(converter, patterns, "__ocml_cos_f32", - "__ocml_cos_f64"); - populateOpPatterns(converter, patterns, "__ocml_exp_f32", - "__ocml_exp_f64"); - populateOpPatterns(converter, patterns, "__ocml_exp2_f32", - "__ocml_exp2_f64"); - populateOpPatterns(converter, patterns, "__ocml_expm1_f32", - "__ocml_expm1_f64"); - populateOpPatterns(converter, patterns, "__ocml_floor_f32", - "__ocml_floor_f64"); - populateOpPatterns(converter, patterns, "__ocml_fmod_f32", - "__ocml_fmod_f64"); - populateOpPatterns(converter, patterns, "__ocml_log_f32", - "__ocml_log_f64"); - populateOpPatterns(converter, patterns, "__ocml_log10_f32", - "__ocml_log10_f64"); - populateOpPatterns(converter, patterns, "__ocml_log1p_f32", - "__ocml_log1p_f64"); - populateOpPatterns(converter, patterns, "__ocml_log2_f32", - "__ocml_log2_f64"); - populateOpPatterns(converter, patterns, "__ocml_pow_f32", - "__ocml_pow_f64"); - populateOpPatterns(converter, patterns, "__ocml_rsqrt_f32", - "__ocml_rsqrt_f64"); - populateOpPatterns(converter, patterns, "__ocml_sin_f32", - "__ocml_sin_f64"); - populateOpPatterns(converter, patterns, "__ocml_sqrt_f32", - "__ocml_sqrt_f64"); - populateOpPatterns(converter, patterns, "__ocml_tanh_f32", - "__ocml_tanh_f64"); - populateOpPatterns(converter, patterns, "__ocml_tan_f32", - "__ocml_tan_f64"); - populateOpPatterns(converter, patterns, "__ocml_erf_f32", - "__ocml_erf_f64"); + populateMathToROCDLConversionPatterns(converter, patterns); } std::unique_ptr> diff --git a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt new file mode 100644 index 0000000000000..2771955aa9493 --- /dev/null +++ b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt @@ -0,0 +1,23 @@ +add_mlir_conversion_library(MLIRMathToROCDL + MathToROCDL.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/MathToROCDL + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRDialectUtils + MLIRFuncDialect + MLIRGPUToGPURuntimeTransforms + MLIRMathDialect + MLIRLLVMCommonConversion + MLIRPass + MLIRTransformUtils + MLIRVectorDialect + MLIRVectorUtils + ) diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp new file mode 100644 index 0000000000000..03c7ce5dac0d1 --- /dev/null +++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp @@ -0,0 +1,146 @@ +//===-- MathToROCDL.cpp - conversion from Math to rocdl calls -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/MathToROCDL/MathToROCDL.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "../GPUCommon/GPUOpsLowering.h" +#include "../GPUCommon/IndexIntrinsicsOpLowering.h" +#include "../GPUCommon/OpToFuncCallLowering.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTMATHTOROCDL +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +#define DEBUG_TYPE "math-to-rocdl" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + +template +static void populateOpPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns, StringRef f32Func, + StringRef f64Func) { + patterns.add>(converter); + patterns.add>(converter, f32Func, f64Func); +} + +void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns) { + // Handled by mathToLLVM: math::AbsIOp + // Handled by mathToLLVM: math::CopySignOp + // Handled by mathToLLVM: math::CountLeadingZerosOp + // Handled by mathToLLVM: math::CountTrailingZerosOp + // Handled by mathToLLVM: math::CgPopOp + // Handled by mathToLLVM: math::FmaOp + // FIXME: math::IPowIOp + // FIXME: math::FPowIOp + // Handled by mathToLLVM: math::RoundEvenOp + // Handled by mathToLLVM: math::RoundOp + // Handled by mathToLLVM: math::TruncOp + populateOpPatterns(converter, patterns, "__ocml_fabs_f32", + "__ocml_fabs_f64"); + populateOpPatterns(converter, patterns, "__ocml_acos_f32", + "__ocml_acos_f64"); + populateOpPatterns(converter, patterns, "__ocml_acosh_f32", + "__ocml_acosh_f64"); + populateOpPatterns(converter, patterns, "__ocml_asin_f32", + "__ocml_asin_f64"); + populateOpPatterns(converter, patterns, "__ocml_asinh_f32", + "__ocml_asinh_f64"); + populateOpPatterns(converter, patterns, "__ocml_atan_f32", + "__ocml_atan_f64"); + populateOpPatterns(converter, patterns, "__ocml_atanh_f32", + "__ocml_atanh_f64"); + populateOpPatterns(converter, patterns, "__ocml_atan2_f32", + "__ocml_atan2_f64"); + populateOpPatterns(converter, patterns, "__ocml_cbrt_f32", + "__ocml_cbrt_f64"); + populateOpPatterns(converter, patterns, "__ocml_ceil_f32", + "__ocml_ceil_f64"); + populateOpPatterns(converter, patterns, "__ocml_cos_f32", + "__ocml_cos_f64"); + populateOpPatterns(converter, patterns, "__ocml_cosh_f32", + "__ocml_cosh_f64"); + populateOpPatterns(converter, patterns, "__ocml_sinh_f32", + "__ocml_sinh_f64"); + populateOpPatterns(converter, patterns, "__ocml_exp_f32", + "__ocml_exp_f64"); + populateOpPatterns(converter, patterns, "__ocml_exp2_f32", + "__ocml_exp2_f64"); + populateOpPatterns(converter, patterns, "__ocml_expm1_f32", + "__ocml_expm1_f64"); + populateOpPatterns(converter, patterns, "__ocml_floor_f32", + "__ocml_floor_f64"); + populateOpPatterns(converter, patterns, "__ocml_log_f32", + "__ocml_log_f64"); + populateOpPatterns(converter, patterns, "__ocml_log10_f32", + 
"__ocml_log10_f64"); + populateOpPatterns(converter, patterns, "__ocml_log1p_f32", + "__ocml_log1p_f64"); + populateOpPatterns(converter, patterns, "__ocml_log2_f32", + "__ocml_log2_f64"); + populateOpPatterns(converter, patterns, "__ocml_pow_f32", + "__ocml_pow_f64"); + populateOpPatterns(converter, patterns, "__ocml_rsqrt_f32", + "__ocml_rsqrt_f64"); + populateOpPatterns(converter, patterns, "__ocml_sin_f32", + "__ocml_sin_f64"); + populateOpPatterns(converter, patterns, "__ocml_sqrt_f32", + "__ocml_sqrt_f64"); + populateOpPatterns(converter, patterns, "__ocml_tanh_f32", + "__ocml_tanh_f64"); + populateOpPatterns(converter, patterns, "__ocml_tan_f32", + "__ocml_tan_f64"); + populateOpPatterns(converter, patterns, "__ocml_erf_f32", + "__ocml_erf_f64"); + // Single arith pattern that needs a ROCDL call, probably not + // worth creating a separate pass for it. + populateOpPatterns(converter, patterns, "__ocml_fmod_f32", + "__ocml_fmod_f64"); +} + +namespace { +struct ConvertMathToROCDLPass + : public impl::ConvertMathToROCDLBase { + ConvertMathToROCDLPass() = default; + void runOnOperation() override; +}; +} // namespace + +void ConvertMathToROCDLPass::runOnOperation() { + auto m = getOperation(); + MLIRContext *ctx = m.getContext(); + + RewritePatternSet patterns(&getContext()); + LowerToLLVMOptions options(ctx, DataLayout(m)); + LLVMTypeConverter converter(ctx, options); + populateMathToROCDLConversionPatterns(converter, patterns); + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addIllegalOp(); + if (failed(applyPartialConversion(m, target, std::move(patterns)))) + signalPassFailure(); +} diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir new file mode 100644 index 0000000000000..a406ec45a7f10 --- /dev/null +++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir @@ -0,0 +1,435 @@ +// RUN: mlir-opt %s -convert-math-to-rocdl -split-input-file | FileCheck %s + +module @test_module { + // CHECK: llvm.func @__ocml_fmod_f32(f32, f32) -> f32 + // CHECK: llvm.func @__ocml_fmod_f64(f64, f64) -> f64 + // CHECK-LABEL: func @arith_remf + func.func @arith_remf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = arith.remf %arg_f32, %arg_f32 : f32 + // CHECK: llvm.call @__ocml_fmod_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 + %result64 = arith.remf %arg_f64, %arg_f64 : f64 + // CHECK: llvm.call @__ocml_fmod_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_fabs_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_fabs_f64(f64) -> f64 + // CHECK-LABEL: func @math_absf + func.func @math_absf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.absf %arg_f32 : f32 + // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.absf %arg_f64 : f64 + // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_acos_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_acos_f64(f64) -> f64 + // CHECK-LABEL: func @math_acos + func.func @math_acos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.acos %arg_f32 : f32 + // CHECK: llvm.call @__ocml_acos_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.acos %arg_f64 : f64 + // CHECK: llvm.call @__ocml_acos_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- 
+ +module @test_module { + // CHECK: llvm.func @__ocml_acosh_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_acosh_f64(f64) -> f64 + // CHECK-LABEL: func @math_acosh + func.func @math_acosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.acosh %arg_f32 : f32 + // CHECK: llvm.call @__ocml_acosh_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.acosh %arg_f64 : f64 + // CHECK: llvm.call @__ocml_acosh_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_asin_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_asin_f64(f64) -> f64 + // CHECK-LABEL: func @math_asin + func.func @math_asin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.asin %arg_f32 : f32 + // CHECK: llvm.call @__ocml_asin_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.asin %arg_f64 : f64 + // CHECK: llvm.call @__ocml_asin_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_asinh_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_asinh_f64(f64) -> f64 + // CHECK-LABEL: func @math_asinh + func.func @math_asinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.asinh %arg_f32 : f32 + // CHECK: llvm.call @__ocml_asinh_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.asinh %arg_f64 : f64 + // CHECK: llvm.call @__ocml_asinh_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_atan_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_atan_f64(f64) -> f64 + // CHECK-LABEL: func @math_atan + func.func @math_atan(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.atan %arg_f32 : f32 + // CHECK: llvm.call @__ocml_atan_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.atan %arg_f64 : f64 + // CHECK: llvm.call @__ocml_atan_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_atanh_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_atanh_f64(f64) -> f64 + // CHECK-LABEL: func @math_atanh + func.func @math_atanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.atanh %arg_f32 : f32 + // CHECK: llvm.call @__ocml_atanh_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.atanh %arg_f64 : f64 + // CHECK: llvm.call @__ocml_atanh_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_atan2_f32(f32, f32) -> f32 + // CHECK: llvm.func @__ocml_atan2_f64(f64, f64) -> f64 + // CHECK-LABEL: func @math_atan2 + func.func @math_atan2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.atan2 %arg_f32, %arg_f32 : f32 + // CHECK: llvm.call @__ocml_atan2_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 + %result64 = math.atan2 %arg_f64, %arg_f64 : f64 + // CHECK: llvm.call @__ocml_atan2_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_cbrt_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_cbrt_f64(f64) -> f64 + // CHECK-LABEL: func @math_cbrt + func.func @math_cbrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.cbrt %arg_f32 : f32 + // CHECK: llvm.call @__ocml_cbrt_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.cbrt %arg_f64 : f64 + // CHECK: llvm.call @__ocml_cbrt_f64(%{{.*}}) : (f64) -> f64 + func.return 
%result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_ceil_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_ceil_f64(f64) -> f64 + // CHECK-LABEL: func @math_ceil + func.func @math_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.ceil %arg_f32 : f32 + // CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.ceil %arg_f64 : f64 + // CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_cos_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_cos_f64(f64) -> f64 + // CHECK-LABEL: func @math_cos + func.func @math_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.cos %arg_f32 : f32 + // CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.cos %arg_f64 : f64 + // CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_cosh_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_cosh_f64(f64) -> f64 + // CHECK-LABEL: func @math_cosh + func.func @math_cosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.cosh %arg_f32 : f32 + // CHECK: llvm.call @__ocml_cosh_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.cosh %arg_f64 : f64 + // CHECK: llvm.call @__ocml_cosh_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_sinh_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_sinh_f64(f64) -> f64 + // CHECK-LABEL: func @math_sinh + func.func @math_sinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.sinh %arg_f32 : f32 + // CHECK: llvm.call @__ocml_sinh_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.sinh %arg_f64 : f64 + // CHECK: llvm.call @__ocml_sinh_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 + // CHECK-LABEL: func @math_exp + func.func @math_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.exp %arg_f32 : f32 + // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.exp %arg_f64 : f64 + // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_exp2_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_exp2_f64(f64) -> f64 + // CHECK-LABEL: func @math_exp2 + func.func @math_exp2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.exp2 %arg_f32 : f32 + // CHECK: llvm.call @__ocml_exp2_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.exp2 %arg_f64 : f64 + // CHECK: llvm.call @__ocml_exp2_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_expm1_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_expm1_f64(f64) -> f64 + // CHECK-LABEL: func @math_expm1 + func.func @math_expm1(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.expm1 %arg_f32 : f32 + // CHECK: llvm.call @__ocml_expm1_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.expm1 %arg_f64 : f64 + // CHECK: llvm.call @__ocml_expm1_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + 
+module @test_module { + // CHECK: llvm.func @__ocml_floor_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_floor_f64(f64) -> f64 + // CHECK-LABEL: func @math_floor + func.func @math_floor(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.floor %arg_f32 : f32 + // CHECK: llvm.call @__ocml_floor_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.floor %arg_f64 : f64 + // CHECK: llvm.call @__ocml_floor_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_log_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 + // CHECK-LABEL: func @math_log + func.func @math_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.log %arg_f32 : f32 + // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.log %arg_f64 : f64 + // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_log10_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_log10_f64(f64) -> f64 + // CHECK-LABEL: func @math_log10 + func.func @math_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.log10 %arg_f32 : f32 + // CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.log10 %arg_f64 : f64 + // CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_log1p_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_log1p_f64(f64) -> f64 + // CHECK-LABEL: func @math_log1p + func.func @math_log1p(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.log1p %arg_f32 : f32 + // CHECK: llvm.call @__ocml_log1p_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.log1p %arg_f64 : f64 + // CHECK: llvm.call @__ocml_log1p_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_pow_f32(f32, f32) -> f32 + // CHECK: llvm.func @__ocml_pow_f64(f64, f64) -> f64 + // CHECK-LABEL: func @math_powf + func.func @math_powf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.powf %arg_f32, %arg_f32 : f32 + // CHECK: llvm.call @__ocml_pow_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 + %result64 = math.powf %arg_f64, %arg_f64 : f64 + // CHECK: llvm.call @__ocml_pow_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_rsqrt_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_rsqrt_f64(f64) -> f64 + // CHECK-LABEL: func @math_rsqrt + func.func @math_rsqrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.rsqrt %arg_f32 : f32 + // CHECK: llvm.call @__ocml_rsqrt_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.rsqrt %arg_f64 : f64 + // CHECK: llvm.call @__ocml_rsqrt_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_sin_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_sin_f64(f64) -> f64 + // CHECK-LABEL: func @math_sin + func.func @math_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.sin %arg_f32 : f32 + // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.sin %arg_f64 : f64 + // CHECK: llvm.call @__ocml_sin_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, 
f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_sqrt_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_sqrt_f64(f64) -> f64 + // CHECK-LABEL: func @math_sqrt + func.func @math_sqrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.sqrt %arg_f32 : f32 + // CHECK: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.sqrt %arg_f64 : f64 + // CHECK: llvm.call @__ocml_sqrt_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_tanh_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_tanh_f64(f64) -> f64 + // CHECK-LABEL: func @math_tanh + func.func @math_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.tanh %arg_f32 : f32 + // CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.tanh %arg_f64 : f64 + // CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_tan_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_tan_f64(f64) -> f64 + // CHECK-LABEL: func @math_tan + func.func @math_tan(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.tan %arg_f32 : f32 + // CHECK: llvm.call @__ocml_tan_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.tan %arg_f64 : f64 + // CHECK: llvm.call @__ocml_tan_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_erf_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_erf_f64(f64) -> f64 + // CHECK-LABEL: func @math_erf + func.func @math_erf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = math.erf %arg_f32 : f32 + // CHECK: llvm.call @__ocml_erf_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.erf %arg_f64 : f64 + // CHECK: llvm.call @__ocml_erf_f64(%{{.*}}) : (f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_fmod_f32(f32, f32) -> f32 + // CHECK: llvm.func @__ocml_fmod_f64(f64, f64) -> f64 + // CHECK-LABEL: func @arith_remf + func.func @arith_remf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = arith.remf %arg_f32, %arg_f32 : f32 + // CHECK: llvm.call @__ocml_fmod_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 + %result64 = arith.remf %arg_f64, %arg_f64 : f64 + // CHECK: llvm.call @__ocml_fmod_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 + func.return %result32, %result64 : f32, f64 + } +} + From 8687f7cd662384e3bd009a0f43eabbbe87f4387a Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Wed, 17 Jul 2024 15:19:31 +0100 Subject: [PATCH 278/777] [RISCV] Support constant hoisting of immediate store values (#96073) Previously getIntImmInstCost only calculated the cost of materialising the argument of a store if it was the address. This means ConstantHoisting's transformation wouldn't kick in for cases like storing two values that require multiple instructions to materialise but where one can be cheaply generated from the other (e.g. by an addition). Two key changes were needed to avoid regressions when enabling this: * Allowing constant materialisation cost to be calculated assuming zeroes are free (as might happen if you had a 2*XLEN constant and one half is zero). * Avoiding constant hoisting if we have a misaligned store that's going to be a legalised to a sequence of narrower stores. 
I'm seeing cases where hoisting the constant ends up with worse codegen in that case. Out of caution and so as not to unexpectedly degrade other existing hoisting logic, FreeZeroes is used only for the new cost calculations for the load instruction. It would likely make sense to revisit this later. --- .../Target/RISCV/MCTargetDesc/RISCVMatInt.cpp | 6 +- .../Target/RISCV/MCTargetDesc/RISCVMatInt.h | 7 ++- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 39 ++++++++++--- .../ConstantHoisting/RISCV/immediates.ll | 58 ++++++++++++++++++- 4 files changed, 95 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 0a857eb96935e..26725cf7decbe 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -499,7 +499,7 @@ InstSeq generateTwoRegInstSeq(int64_t Val, const MCSubtargetInfo &STI, } int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, - bool CompressionCost) { + bool CompressionCost, bool FreeZeroes) { bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit); bool HasRVC = CompressionCost && (STI.hasFeature(RISCV::FeatureStdExtC) || STI.hasFeature(RISCV::FeatureStdExtZca)); @@ -510,10 +510,12 @@ int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) { APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize); + if (FreeZeroes && Chunk.getSExtValue() == 0) + continue; InstSeq MatSeq = generateInstSeq(Chunk.getSExtValue(), STI); Cost += getInstSeqCost(MatSeq, HasRVC); } - return std::max(1, Cost); + return std::max(FreeZeroes ? 0 : 1, Cost); } OpndKind Inst::getOpndKind() const { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index e87e0f3256470..ae94f3778b217 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -71,8 +71,13 @@ InstSeq generateTwoRegInstSeq(int64_t Val, const MCSubtargetInfo &STI, // If CompressionCost is true it will use a different cost calculation if RVC is // enabled. This should be used to compare two different sequences to determine // which is more compressible. +// +// If FreeZeroes is true, it will be assumed free to materialize any +// XLen-sized chunks that are 0. This is appropriate to use in instances when +// the zero register can be used, e.g. when estimating the cost of +// materializing a value used by a particular operation. 
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, - bool CompressionCost = false); + bool CompressionCost = false, bool FreeZeroes = false); } // namespace RISCVMatInt } // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index d603138773de4..f9eef60f77b7a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -109,8 +109,11 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef OpCodes, MVT VT, return Cost; } -InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind) { +static InstructionCost getIntImmCostImpl(const DataLayout &DL, + const RISCVSubtarget *ST, + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind, + bool FreeZeroes) { assert(Ty->isIntegerTy() && "getIntImmCost can only estimate cost of materialising integers"); @@ -119,8 +122,13 @@ InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, return TTI::TCC_Free; // Otherwise, we check how many instructions it will take to materialise. - const DataLayout &DL = getDataLayout(); - return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST()); + return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST, + /*CompressionCost=*/false, FreeZeroes); +} + +InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { + return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false); } // Look for patterns of shift followed by AND that can be turned into a pair of @@ -172,11 +180,24 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, // split up large offsets in GEP into better parts than ConstantHoisting // can. return TTI::TCC_Free; - case Instruction::Store: - // If the address is a constant, use the materialization cost. - if (Idx == 1) - return getIntImmCost(Imm, Ty, CostKind); - return TTI::TCC_Free; + case Instruction::Store: { + // Use the materialization cost regardless of if it's the address or the + // value that is constant, except for if the store is misaligned and + // misaligned accesses are not legal (experience shows constant hoisting + // can sometimes be harmful in such cases). + if (Idx == 1 || !Inst) + return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, + /*FreeZeroes=*/true); + + StoreInst *ST = cast(Inst); + if (!getTLI()->allowsMemoryAccessForAlignment( + Ty->getContext(), DL, getTLI()->getValueType(DL, Ty), + ST->getPointerAddressSpace(), ST->getAlign())) + return TTI::TCC_Free; + + return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, + /*FreeZeroes=*/true); + } case Instruction::Load: // If the address is a constant, use the materialization cost. return getIntImmCost(Imm, Ty, CostKind); diff --git a/llvm/test/Transforms/ConstantHoisting/RISCV/immediates.ll b/llvm/test/Transforms/ConstantHoisting/RISCV/immediates.ll index 8f57df6edb2c0..329281e7dc301 100644 --- a/llvm/test/Transforms/ConstantHoisting/RISCV/immediates.ll +++ b/llvm/test/Transforms/ConstantHoisting/RISCV/immediates.ll @@ -211,12 +211,64 @@ exit: ; Check that we use a common base for immediates needed by a store if the ; constants require more than 1 instruction. -; TODO: This doesn't trigger currently. 
define void @test20(ptr %p1, ptr %p2) { ; CHECK-LABEL: test20 -; CHECK: store i32 15111111, ptr %p1 -; CHECK: store i32 15111112, ptr %p2 +; CHECK: %const = bitcast i32 15111111 to i32 +; CHECK: store i32 %const, ptr %p1, align 4 +; CHECK: %const_mat = add i32 %const, 1 +; CHECK: store i32 %const_mat, ptr %p2, align 4 store i32 15111111, ptr %p1, align 4 store i32 15111112, ptr %p2, align 4 ret void } + +define void @test21(ptr %p1, ptr %p2) { +; CHECK-LABEL: define void @test21( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i32 15111111, ptr [[P1]], align 1 +; CHECK-NEXT: store i32 15111112, ptr [[P2]], align 1 +; CHECK-NEXT: ret void +; + store i32 15111111, ptr %p1, align 1 + store i32 15111112, ptr %p2, align 1 + ret void +} + +; 0 immediates shouldn't be hoisted. +define void @test22(ptr %p1, ptr %p2) { +; CHECK-LABEL: define void @test22( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i64 0, ptr [[P1]], align 8 +; CHECK-NEXT: store i64 -1, ptr [[P2]], align 8 +; CHECK-NEXT: ret void +; + store i64 0, ptr %p1, align 8 + store i64 -1, ptr %p2, align 8 + ret void +} + +; 0 immediates shouldn't be hoisted. +define void @test23(ptr %p1, ptr %p2) { +; CHECK-LABEL: define void @test23( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i127 0, ptr [[P1]], align 8 +; CHECK-NEXT: store i127 -1, ptr [[P2]], align 8 +; CHECK-NEXT: ret void +; + store i127 0, ptr %p1, align 8 + store i127 -1, ptr %p2, align 8 + ret void +} + +; Hoisting doesn't happen for types that aren't legal. +define void @test24(ptr %p1, ptr %p2) { +; CHECK-LABEL: define void @test24( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i128 15111111, ptr [[P1]], align 4 +; CHECK-NEXT: store i128 15111112, ptr [[P2]], align 4 +; CHECK-NEXT: ret void +; + store i128 15111111, ptr %p1, align 4 + store i128 15111112, ptr %p2, align 4 + ret void +} From d3d2f9a4208eedbd2f372c34725ab61c3f4d3aed Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 07:17:25 -0400 Subject: [PATCH 279/777] [SLP]Improve minbitwidth analysis for trun'ed gather nodes. If the gather node is trunc'ed, better to trunc scalars and then gather them rather than gather and then trunc. Trunc for scalars is free in most cases. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/99072 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 18 +++++++- .../X86/int-bitcast-minbitwidth.ll | 6 +-- .../X86/minbitwidth-transformed-operand.ll | 22 ++++----- .../Transforms/SLPVectorizer/X86/resched.ll | 45 ++++++++++--------- .../SLPVectorizer/X86/shuffle-multivector.ll | 13 +++--- 5 files changed, 61 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7bdbbecb7f0d8..b46644650a5dc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15522,8 +15522,24 @@ void BoUpSLP::computeMinimumValueSizes() { auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, unsigned Opcode, unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) { + bool IsSignedCmp) -> unsigned { ToDemote.clear(); + // Check if the root is trunc and the next node is gather/buildvector, then + // keep trunc in scalars, which is free in most cases. + if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && + E.Idx > (IsStoreOrInsertElt ? 
2 : 1)) { + ToDemote.push_back(E.Idx); + const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; + auto It = MinBWs.find(UserTE); + if (It != MinBWs.end()) + return It->second.first; + unsigned MaxBitWidth = + bit_ceil(DL->getTypeSizeInBits(UserTE->Scalars.front()->getType())); + if (MaxBitWidth < 8 && MaxBitWidth > 1) + MaxBitWidth = 8; + return MaxBitWidth; + } + unsigned VF = E.getVectorFactor(); auto *TreeRootIT = dyn_cast(E.Scalars.front()->getType()); if (!TreeRootIT || !Opcode) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index 789d73947d1c7..97e505f4319c6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -5,9 +5,9 @@ define void @t(i64 %v) { ; CHECK-LABEL: define void @t( ; CHECK-SAME: i64 [[V:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[V]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 032625a1199f9..57b5d2af48ee6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -5,20 +5,16 @@ define void @test(i64 %d.promoted.i) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> , <16 x i1> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP6]]) ; 
CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index b7237cbb02bb3..4ed52247c2ef3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,26 +11,31 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], -; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 -; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 -; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], -; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB_I]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> 
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i8> [[TMP19]], +; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll index 143052a3d9cd0..c2555889f5981 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-160 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( @@ -14,13 +14,14 @@ define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[T5:%.*]] = trunc i128 [[P1]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i128> [[TMP1]], <2 x i128> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i128> [[VEC:%.*]], <4 x i128> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i128> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i128> [[VEC:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i128> [[VEC]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP12]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP12]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP13]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: From 05b067b5f952a427f80e1c39a5c9025fdb2d64b2 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 07:31:27 -0700 Subject: [PATCH 280/777] Revert "[SLP]Improve minbitwidth analysis for trun'ed gather nodes." This reverts commit d3d2f9a4208eedbd2f372c34725ab61c3f4d3aed to fix buildbot https://lab.llvm.org/buildbot/#/builders/92/builds/1880. 
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 18 +------- .../X86/int-bitcast-minbitwidth.ll | 6 +-- .../X86/minbitwidth-transformed-operand.ll | 22 +++++---- .../Transforms/SLPVectorizer/X86/resched.ll | 45 +++++++++---------- .../SLPVectorizer/X86/shuffle-multivector.ll | 13 +++--- 5 files changed, 43 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b46644650a5dc..7bdbbecb7f0d8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15522,24 +15522,8 @@ void BoUpSLP::computeMinimumValueSizes() { auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, unsigned Opcode, unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) -> unsigned { + bool IsSignedCmp) { ToDemote.clear(); - // Check if the root is trunc and the next node is gather/buildvector, then - // keep trunc in scalars, which is free in most cases. - if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && - E.Idx > (IsStoreOrInsertElt ? 2 : 1)) { - ToDemote.push_back(E.Idx); - const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; - auto It = MinBWs.find(UserTE); - if (It != MinBWs.end()) - return It->second.first; - unsigned MaxBitWidth = - bit_ceil(DL->getTypeSizeInBits(UserTE->Scalars.front()->getType())); - if (MaxBitWidth < 8 && MaxBitWidth > 1) - MaxBitWidth = 8; - return MaxBitWidth; - } - unsigned VF = E.getVectorFactor(); auto *TreeRootIT = dyn_cast(E.Scalars.front()->getType()); if (!TreeRootIT || !Opcode) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index 97e505f4319c6..789d73947d1c7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -5,9 +5,9 @@ define void @t(i64 %v) { ; CHECK-LABEL: define void @t( ; CHECK-SAME: i64 [[V:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[V]] to i16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 57b5d2af48ee6..032625a1199f9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -5,16 +5,20 @@ define void @test(i64 %d.promoted.i) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> 
[[TMP2]] to <2 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> , <16 x i1> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP6]]) +; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index 4ed52247c2ef3..b7237cbb02bb3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,31 +11,26 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 -; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 -; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB_I]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], -; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i8> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], 
<8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP18]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i8> [[TMP19]], -; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr undef, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], +; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 +; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 +; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], +; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll index c2555889f5981..143052a3d9cd0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-160 | FileCheck %s define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( @@ -14,14 +14,13 @@ define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[T5:%.*]] = trunc i128 [[P1]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i128> [[VEC:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i128> [[VEC]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i128> [[TMP1]], <2 x i128> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i128> [[VEC:%.*]], <4 x i128> [[TMP9]], <4 x i32> +; 
CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i128> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP11]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP13]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP12]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: From c5c1bd164fc81a992dfdb5b7c7c672dab0e3f165 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 07:17:25 -0400 Subject: [PATCH 281/777] [SLP]Improve minbitwidth analysis for trun'ed gather nodes. If the gather node is trunc'ed, better to trunc scalars and then gather them rather than gather and then trunc. Trunc for scalars is free in most cases. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/99072 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 19 +++++++- .../X86/int-bitcast-minbitwidth.ll | 6 +-- .../X86/minbitwidth-transformed-operand.ll | 22 ++++----- .../Transforms/SLPVectorizer/X86/resched.ll | 45 ++++++++++--------- .../SLPVectorizer/X86/shuffle-multivector.ll | 13 +++--- 5 files changed, 62 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7bdbbecb7f0d8..1cf2ff89371d9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15522,8 +15522,25 @@ void BoUpSLP::computeMinimumValueSizes() { auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, unsigned Opcode, unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) { + bool IsSignedCmp) -> unsigned { ToDemote.clear(); + // Check if the root is trunc and the next node is gather/buildvector, then + // keep trunc in scalars, which is free in most cases. + if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && + E.Idx > (IsStoreOrInsertElt ? 
2 : 1)) { + ToDemote.push_back(E.Idx); + const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; + auto It = MinBWs.find(UserTE); + if (It != MinBWs.end()) + return It->second.first; + unsigned MaxBitWidth = + DL->getTypeSizeInBits(UserTE->Scalars.front()->getType()); + MaxBitWidth = bit_ceil(MaxBitWidth); + if (MaxBitWidth < 8 && MaxBitWidth > 1) + MaxBitWidth = 8; + return MaxBitWidth; + } + unsigned VF = E.getVectorFactor(); auto *TreeRootIT = dyn_cast(E.Scalars.front()->getType()); if (!TreeRootIT || !Opcode) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index 789d73947d1c7..97e505f4319c6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -5,9 +5,9 @@ define void @t(i64 %v) { ; CHECK-LABEL: define void @t( ; CHECK-SAME: i64 [[V:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[V]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 032625a1199f9..57b5d2af48ee6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -5,20 +5,16 @@ define void @test(i64 %d.promoted.i) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> , <16 x i1> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i1 
@llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index b7237cbb02bb3..4ed52247c2ef3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,26 +11,31 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], -; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 -; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 -; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], -; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB_I]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i8> [[TMP19]], +; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll index 143052a3d9cd0..c2555889f5981 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-160 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( @@ -14,13 +14,14 @@ define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[T5:%.*]] = trunc i128 [[P1]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i128> [[TMP1]], <2 x i128> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i128> [[VEC:%.*]], <4 x i128> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i128> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i128> [[VEC:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i128> [[VEC]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP12]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP12]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP13]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: From 554febd3aad8d7cea7b8f8f6124d691031fb618c Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Wed, 17 Jul 2024 15:42:02 +0100 Subject: [PATCH 282/777] [Clang] Fix some assertions not looking through type sugar (#92299) Fixes #92284 Co-authored-by: cor3ntin --- clang/lib/AST/ExprConstant.cpp | 2 +- clang/lib/Sema/SemaInit.cpp | 2 +- clang/test/SemaCXX/paren-list-agg-init.cpp | 21 ++++++++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 0aeac9d03eed3..5af712dd7257b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11478,7 +11478,7 @@ bool 
ArrayExprEvaluator::VisitCXXConstructExpr(const CXXConstructExpr *E, bool ArrayExprEvaluator::VisitCXXParenListInitExpr( const CXXParenListInitExpr *E) { - assert(dyn_cast(E->getType()) && + assert(E->getType()->isConstantArrayType() && "Expression result is not a constant array type"); return VisitCXXParenListOrInitListExpr(E, E->getInitExprs(), diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index d97a5c8988840..17435afab03f4 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5621,7 +5621,7 @@ static void TryOrBuildParenListInitialization( << SE->getSourceRange(); return; } else { - assert(isa(Entity.getType())); + assert(Entity.getType()->isIncompleteArrayType()); ArrayLength = Args.size(); } EntityIndexToProcess = ArrayLength; diff --git a/clang/test/SemaCXX/paren-list-agg-init.cpp b/clang/test/SemaCXX/paren-list-agg-init.cpp index efc1e955d4ed8..cc2a9d88dd4a6 100644 --- a/clang/test/SemaCXX/paren-list-agg-init.cpp +++ b/clang/test/SemaCXX/paren-list-agg-init.cpp @@ -314,8 +314,8 @@ namespace GH63903 { // expected-error {{constexpr variable 's' must be initialized by a constant expression}} } - namespace gh62863 { + int (&&arr)[] = static_cast(42); // beforecxx20-warning@-1 {{aggregate initialization of type 'int[1]' from a parenthesized list of values is a C++20 extension}} int (&&arr1)[1] = static_cast(42); @@ -333,4 +333,23 @@ int (&&arr6)[2] = (int[])(42); // expected-error {{reference to type 'int[2]' co // beforecxx20-warning@-1 {{aggregate initialization of type 'int[1]' from a parenthesized list of values is a C++20 extension}} int (&&arr7)[3] = (int[3])(42); // beforecxx20-warning@-1 {{aggregate initialization of type 'int[3]' from a parenthesized list of values is a C++20 extension}} + +} + +namespace GH92284 { + +using T = int[1]; T x(42); +// beforecxx20-warning@-1 {{aggregate initialization of type 'T' (aka 'int[1]') from a parenthesized list of values is a C++20 extension}} +using Ta = int[2]; Ta a(42); +// beforecxx20-warning@-1 {{aggregate initialization of type 'Ta' (aka 'int[2]') from a parenthesized list of values is a C++20 extension}} +using Tb = int[2]; Tb b(42,43); +// beforecxx20-warning@-1 {{aggregate initialization of type 'Tb' (aka 'int[2]') from a parenthesized list of values is a C++20 extension}} +using Tc = int[]; Tc c(42); +// beforecxx20-warning@-1 {{aggregate initialization of type 'int[1]' from a parenthesized list of values is a C++20 extension}} +using Td = int[]; Td d(42,43); +// beforecxx20-warning@-1 {{aggregate initialization of type 'int[2]' from a parenthesized list of values is a C++20 extension}} +template using ThroughAlias = T[Sz]; +ThroughAlias e(42); +// beforecxx20-warning@-1 {{aggregate initialization of type 'ThroughAlias' (aka 'int[1]') from a parenthesized list of values is a C++20 extension}} + } From ec9d62fe84fe314370a256306c083a9e7079b80b Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Wed, 17 Jul 2024 11:45:49 -0300 Subject: [PATCH 283/777] [lldb] Disable verbose_trap.test on Windows (#99323) verbose_trap.test, added in #80368, fails on some Windows bots. See https://lab.llvm.org/buildbot/#/builders/141/builds/808. 
--- lldb/test/Shell/Recognizer/verbose_trap.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/Shell/Recognizer/verbose_trap.test b/lldb/test/Shell/Recognizer/verbose_trap.test index 45ef84bef611f..dafab7bdea688 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap.test +++ b/lldb/test/Shell/Recognizer/verbose_trap.test @@ -1,3 +1,5 @@ +# UNSUPPORTED: system-windows +# # RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-BOTH # From 77b2c681677db02552475426f0f7cf2c009ff98d Mon Sep 17 00:00:00 2001 From: Nathan James Date: Wed, 17 Jul 2024 15:50:24 +0100 Subject: [PATCH 284/777] [clang-tidy] Add support for std::rotate(_copy) and inplace_merge to modernize-use-ranges (#99057) These algorithms take 3 iterators for the range and we are only interested in the first and last iterator argument. The ranges versions of these take a range and an iterator(defined to be inside the range) so the transformation is pretty similar `algo(I.begin, other, I.end,...)` -> `ranges::algo(I, other,...)` --- .../clang-tidy/modernize/UseRangesCheck.cpp | 11 ++++++++++- .../docs/clang-tidy/checks/modernize/use-ranges.rst | 3 +++ .../checkers/modernize/Inputs/use-ranges/fake_std.h | 3 +++ .../test/clang-tidy/checkers/modernize/use-ranges.cpp | 11 +++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp index b0a31ad53be3f..604204e762c78 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp @@ -96,6 +96,9 @@ static constexpr const char *TwoRangeNames[] = { "is_permutation", }; +static constexpr const char *SinglePivotRangeNames[] = {"rotate", "rotate_copy", + "inplace_merge"}; + namespace { class StdReplacer : public utils::UseRangesCheck::Replacer { public: @@ -141,13 +144,19 @@ utils::UseRangesCheck::ReplacerMap UseRangesCheck::getReplacerMap() const { // Func(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2,...). static const Signature TwoRangeArgs = {{0}, {2}}; + // template Func(Iter first, Iter pivot, Iter last,...). 
+ static const Signature SinglePivotRange = {{0, 2}}; + static const Signature SingleRangeFunc[] = {SingleRangeArgs}; static const Signature TwoRangeFunc[] = {TwoRangeArgs}; + static const Signature SinglePivotFunc[] = {SinglePivotRange}; + static const std::pair, ArrayRef> AlgorithmNames[] = {{SingleRangeFunc, SingleRangeNames}, - {TwoRangeFunc, TwoRangeNames}}; + {TwoRangeFunc, TwoRangeNames}, + {SinglePivotFunc, SinglePivotRangeNames}}; SmallString<64> Buff; for (const auto &[Signatures, Values] : AlgorithmNames) { auto Replacer = llvm::makeIntrusiveRefCnt( diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst index 5c0b8058e4535..1ce866ca1f66a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst @@ -46,6 +46,7 @@ Calls to the following std library algorithms are checked: ``std::for_each``, ``std::generate``, ``std::includes``, +``std::inplace_merge``, ``std::iota``, ``std::is_heap_until``, ``std::is_heap``, @@ -79,6 +80,8 @@ Calls to the following std library algorithms are checked: ``std::replace``, ``std::reverse_copy``, ``std::reverse``, +``std::rotate``, +``std::rotate_copy``, ``std::sample``, ``std::search``, ``std::set_difference``, diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h index 987ee4e35b3bc..6596511c7a38b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h @@ -106,6 +106,9 @@ bool equal(InputIt1 first1, InputIt1 last1, template void iota(ForwardIt first, ForwardIt last, T value); +template +ForwardIt rotate(ForwardIt first, ForwardIt middle, ForwardIt last); + } // namespace std #endif // USE_RANGES_FAKE_STD_H diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp index e937e1e4e7d3b..b022efebfdf4d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp @@ -57,6 +57,10 @@ void Positives() { // CHECK-MESSAGES-CPP23: :[[@LINE-1]]:3: warning: use a ranges version of this algorithm // CHECK-FIXES-CPP23: std::ranges::iota(I, 0); + std::rotate(I.begin(), I.begin() + 2, I.end()); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use a ranges version of this algorithm + // CHECK-FIXES: std::ranges::rotate(I, I.begin() + 2); + using std::find; namespace my_std = std; @@ -100,4 +104,11 @@ void Negatives() { std::equal(I.begin(), I.end(), J.end(), J.end()); std::equal(std::rbegin(I), std::rend(I), std::rend(J), std::rbegin(J)); std::equal(I.begin(), J.end(), I.begin(), I.end()); + + // std::rotate expects the full range in the 1st and 3rd argument. + // Anyone writing this code has probably written a bug, but this isn't the + // purpose of this check. 
+ std::rotate(I.begin(), I.end(), I.begin() + 2); + // Pathological, but probably shouldn't diagnose this + std::rotate(I.begin(), I.end(), I.end() + 0); } From c034c44362a5dda93a8049d452625c59b76f7169 Mon Sep 17 00:00:00 2001 From: Tim Gymnich Date: Wed, 17 Jul 2024 16:57:42 +0200 Subject: [PATCH 285/777] [InstCombine] Fold select of symmetric selects (#99245) fixes #98800 Fold patterns like: select c2 (select c1 a b) (select c1 b a) into: select (xor c1 c2) b a Alive2 proofs: https://alive2.llvm.org/ce/z/4QAm4K https://alive2.llvm.org/ce/z/vTVRnC --- .../InstCombine/InstCombineSelect.cpp | 29 +++++ .../select-of-symmetric-selects.ll | 122 ++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/select-of-symmetric-selects.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 394dfca262e13..e387034110df9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3012,6 +3012,32 @@ struct DecomposedSelect { }; } // namespace +/// Folds patterns like: +/// select c2 (select c1 a b) (select c1 b a) +/// into: +/// select (xor c1 c2) b a +static Instruction * +foldSelectOfSymmetricSelect(SelectInst &OuterSelVal, + InstCombiner::BuilderTy &Builder) { + + Value *OuterCond, *InnerCond, *InnerTrueVal, *InnerFalseVal; + if (!match( + &OuterSelVal, + m_Select(m_Value(OuterCond), + m_OneUse(m_Select(m_Value(InnerCond), m_Value(InnerTrueVal), + m_Value(InnerFalseVal))), + m_OneUse(m_Select(m_Deferred(InnerCond), + m_Deferred(InnerFalseVal), + m_Deferred(InnerTrueVal)))))) + return nullptr; + + if (OuterCond->getType() != InnerCond->getType()) + return nullptr; + + Value *Xor = Builder.CreateXor(InnerCond, OuterCond); + return SelectInst::Create(Xor, InnerFalseVal, InnerTrueVal); +} + /// Look for patterns like /// %outer.cond = select i1 %inner.cond, i1 %alt.cond, i1 false /// %inner.sel = select i1 %inner.cond, i8 %inner.sel.t, i8 %inner.sel.f @@ -3987,6 +4013,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { } } + if (Instruction *I = foldSelectOfSymmetricSelect(SI, Builder)) + return I; + if (Instruction *I = foldNestedSelects(SI, Builder)) return I; diff --git a/llvm/test/Transforms/InstCombine/select-of-symmetric-selects.ll b/llvm/test/Transforms/InstCombine/select-of-symmetric-selects.ll new file mode 100644 index 0000000000000..0936f58ac9443 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/select-of-symmetric-selects.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i32 @select_of_symmetric_selects(i32 %a, i32 %b, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects( +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TMP1]], i32 [[B:%.*]], i32 [[A:%.*]] +; CHECK-NEXT: ret i32 [[RET]] +; + %sel1 = select i1 %c1, i32 %a, i32 %b + %sel2 = select i1 %c1, i32 %b, i32 %a + %ret = select i1 %c2, i32 %sel1, i32 %sel2 + ret i32 %ret +} + +define i32 @select_of_symmetric_selects_negative1(i32 %a, i32 %b, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_negative1( +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1:%.*]], i32 [[A:%.*]], i32 [[B:%.*]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[C2:%.*]], i32 [[SEL1]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RET]] +; + %sel1 = select i1 %c1, i32 %a, i32 %b + %sel2 = 
select i1 %c2, i32 %b, i32 %a + %ret = select i1 %c2, i32 %sel1, i32 %sel2 + ret i32 %ret +} + +define i32 @select_of_symmetric_selects_negative2(i32 %a, i32 %b, i32 %c, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_negative2( +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1:%.*]], i32 [[A:%.*]], i32 [[B:%.*]] +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C1]], i32 [[B]], i32 [[C:%.*]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[C2:%.*]], i32 [[SEL1]], i32 [[SEL2]] +; CHECK-NEXT: ret i32 [[RET]] +; + %sel1 = select i1 %c1, i32 %a, i32 %b + %sel2 = select i1 %c1, i32 %b, i32 %c + %ret = select i1 %c2, i32 %sel1, i32 %sel2 + ret i32 %ret +} + +declare void @use(i32) + +define i32 @select_of_symmetric_selects_multi_use1(i32 %a, i32 %b, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_multi_use1( +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1:%.*]], i32 [[A:%.*]], i32 [[B:%.*]] +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C1]], i32 [[B]], i32 [[A]] +; CHECK-NEXT: call void @use(i32 [[SEL2]]) +; CHECK-NEXT: [[RET:%.*]] = select i1 [[C2:%.*]], i32 [[SEL1]], i32 [[SEL2]] +; CHECK-NEXT: ret i32 [[RET]] +; + %sel1 = select i1 %c1, i32 %a, i32 %b + %sel2 = select i1 %c1, i32 %b, i32 %a + call void @use(i32 %sel2) + %ret = select i1 %c2, i32 %sel1, i32 %sel2 + ret i32 %ret +} + +define i32 @select_of_symmetric_selects_multi_use2(i32 %a, i32 %b, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_multi_use2( +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1:%.*]], i32 [[A:%.*]], i32 [[B:%.*]] +; CHECK-NEXT: call void @use(i32 [[SEL1]]) +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C1]], i32 [[B]], i32 [[A]] +; CHECK-NEXT: call void @use(i32 [[SEL2]]) +; CHECK-NEXT: [[RET:%.*]] = select i1 [[C2:%.*]], i32 [[SEL1]], i32 [[SEL2]] +; CHECK-NEXT: ret i32 [[RET]] +; + %sel1 = select i1 %c1, i32 %a, i32 %b + call void @use(i32 %sel1) + %sel2 = select i1 %c1, i32 %b, i32 %a + call void @use(i32 %sel2) + %ret = select i1 %c2, i32 %sel1, i32 %sel2 + ret i32 %ret +} + +define i32 @select_of_symmetric_selects_commuted(i32 %a, i32 %b, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_commuted( +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TMP1]], i32 [[A:%.*]], i32 [[B:%.*]] +; CHECK-NEXT: ret i32 [[RET]] +; + %sel1 = select i1 %c1, i32 %a, i32 %b + %sel2 = select i1 %c1, i32 %b, i32 %a + %ret = select i1 %c2, i32 %sel2, i32 %sel1 + ret i32 %ret +} + +define <4 x i32> @select_of_symmetric_selects_vector1(<4 x i32> %a, <4 x i32> %b, i1 %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_vector1( +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %sel1 = select i1 %c1, <4 x i32> %a, <4 x i32> %b + %sel2 = select i1 %c1, <4 x i32> %b, <4 x i32> %a + %ret = select i1 %c2, <4 x i32> %sel2, <4 x i32> %sel1 + ret <4 x i32> %ret +} + +define <4 x i32> @select_of_symmetric_selects_vector2(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c1, <4 x i1> %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_vector2( +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: [[RET:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %sel1 = select <4 x i1> %c1, <4 x i32> %a, <4 x i32> %b + %sel2 = select <4 x i1> %c1, <4 x i32> %b, <4 x i32> %a + %ret = select <4 x i1> %c2, <4 x i32> %sel2, <4 x i32> %sel1 + ret <4 x i32> %ret 
+} + +define <2 x i32> @select_of_symmetric_selects_vector3(<2 x i32> %a, <2 x i32> %b, <2 x i1> %c1, i1 %c2) { +; CHECK-LABEL: @select_of_symmetric_selects_vector3( +; CHECK-NEXT: [[SEL1:%.*]] = select <2 x i1> [[C1:%.*]], <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]] +; CHECK-NEXT: [[SEL2:%.*]] = select <2 x i1> [[C1]], <2 x i32> [[B]], <2 x i32> [[A]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[C2:%.*]], <2 x i32> [[SEL1]], <2 x i32> [[SEL2]] +; CHECK-NEXT: ret <2 x i32> [[RET]] +; + %sel1 = select <2 x i1> %c1, <2 x i32> %a, <2 x i32> %b + %sel2 = select <2 x i1> %c1, <2 x i32> %b, <2 x i32> %a + %ret = select i1 %c2, <2 x i32> %sel1, <2 x i32> %sel2 + ret <2 x i32> %ret + } From a56e009ef852926c8e77eb8e50739d2b5a389212 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Wed, 17 Jul 2024 15:58:21 +0100 Subject: [PATCH 286/777] [Clang] [C23] Fix typeof_unqual for qualified array types (#92767) Properly remove qualifiers for both the element type and the array type Fixes #92667 --------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/AST/ASTContext.h | 4 +++ clang/include/clang/AST/Type.h | 37 ++++++++++------------ clang/lib/AST/ASTContext.cpp | 12 +++---- clang/lib/AST/Type.cpp | 38 +++++++++++++++++----- clang/test/Sema/c2x-typeof.c | 47 ++++++++++++++++++++++++++++ 6 files changed, 106 insertions(+), 34 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e63282ca3b40d..1c1b874273a7c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -838,6 +838,8 @@ Bug Fixes in This Version - ``__has_unique_object_representations`` correctly handles arrays of unknown bounds of types by ensuring they are complete and instantiating them if needed. Fixes (#GH95311). +- ``typeof_unqual`` now properly removes type qualifiers from arrays and their element types. (#GH92667) + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 57022e75073fe..13aa203de32ba 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -2653,6 +2653,10 @@ class ASTContext : public RefCountedBase { /// \returns if this is an array type, the completely unqualified array type /// that corresponds to it. Otherwise, returns T.getUnqualifiedType(). QualType getUnqualifiedArrayType(QualType T, Qualifiers &Quals) const; + QualType getUnqualifiedArrayType(QualType T) const { + Qualifiers Quals; + return getUnqualifiedArrayType(T, Quals); + } /// Determine whether the given types are equivalent after /// cvr-qualifiers have been removed. diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 3aa0f05b0ab60..4c9ba37fe1e3a 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1618,6 +1618,10 @@ class QualType { QualType stripObjCKindOfType(const ASTContext &ctx) const; /// Remove all qualifiers including _Atomic. + /// + /// Like getUnqualifiedType(), the type may still be qualified if it is a + /// sugared array type. To strip qualifiers even from within a sugared array + /// type, use in conjunction with ASTContext::getUnqualifiedArrayType. 
QualType getAtomicUnqualifiedType() const; private: @@ -2105,8 +2109,8 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { LLVM_PREFERRED_TYPE(TypeBitfields) unsigned : NumTypeBits; - LLVM_PREFERRED_TYPE(bool) - unsigned IsUnqual : 1; // If true: typeof_unqual, else: typeof + LLVM_PREFERRED_TYPE(TypeOfKind) + unsigned Kind : 1; }; class UsingBitfields { @@ -5661,19 +5665,20 @@ class MacroQualifiedType : public Type { /// extension) or a `typeof_unqual` expression (a C23 feature). class TypeOfExprType : public Type { Expr *TOExpr; + const ASTContext &Context; protected: friend class ASTContext; // ASTContext creates these. - TypeOfExprType(Expr *E, TypeOfKind Kind, QualType Can = QualType()); + TypeOfExprType(const ASTContext &Context, Expr *E, TypeOfKind Kind, + QualType Can = QualType()); public: Expr *getUnderlyingExpr() const { return TOExpr; } /// Returns the kind of 'typeof' type this is. TypeOfKind getKind() const { - return TypeOfBits.IsUnqual ? TypeOfKind::Unqualified - : TypeOfKind::Qualified; + return static_cast(TypeOfBits.Kind); } /// Remove a single level of sugar. @@ -5694,7 +5699,8 @@ class TypeOfExprType : public Type { class DependentTypeOfExprType : public TypeOfExprType, public llvm::FoldingSetNode { public: - DependentTypeOfExprType(Expr *E, TypeOfKind Kind) : TypeOfExprType(E, Kind) {} + DependentTypeOfExprType(const ASTContext &Context, Expr *E, TypeOfKind Kind) + : TypeOfExprType(Context, E, Kind) {} void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) { Profile(ID, Context, getUnderlyingExpr(), @@ -5711,32 +5717,23 @@ class TypeOfType : public Type { friend class ASTContext; // ASTContext creates these. QualType TOType; + const ASTContext &Context; - TypeOfType(QualType T, QualType Can, TypeOfKind Kind) - : Type(TypeOf, - Kind == TypeOfKind::Unqualified ? Can.getAtomicUnqualifiedType() - : Can, - T->getDependence()), - TOType(T) { - TypeOfBits.IsUnqual = Kind == TypeOfKind::Unqualified; - } + TypeOfType(const ASTContext &Context, QualType T, QualType Can, + TypeOfKind Kind); public: QualType getUnmodifiedType() const { return TOType; } /// Remove a single level of sugar. - QualType desugar() const { - QualType QT = getUnmodifiedType(); - return TypeOfBits.IsUnqual ? QT.getAtomicUnqualifiedType() : QT; - } + QualType desugar() const; /// Returns whether this type directly provides sugar. bool isSugared() const { return true; } /// Returns the kind of 'typeof' type this is. TypeOfKind getKind() const { - return TypeOfBits.IsUnqual ? TypeOfKind::Unqualified - : TypeOfKind::Qualified; + return static_cast(TypeOfBits.Kind); } static bool classof(const Type *T) { return T->getTypeClass() == TypeOf; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index ccbb4baad68af..f4aa1387974aa 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6020,19 +6020,19 @@ QualType ASTContext::getTypeOfExprType(Expr *tofExpr, TypeOfKind Kind) const { if (Canon) { // We already have a "canonical" version of an identical, dependent // typeof(expr) type. Use that as our canonical type. - toe = new (*this, alignof(TypeOfExprType)) - TypeOfExprType(tofExpr, Kind, QualType((TypeOfExprType *)Canon, 0)); + toe = new (*this, alignof(TypeOfExprType)) TypeOfExprType( + *this, tofExpr, Kind, QualType((TypeOfExprType *)Canon, 0)); } else { // Build a new, canonical typeof(expr) type. 
Canon = new (*this, alignof(DependentTypeOfExprType)) - DependentTypeOfExprType(tofExpr, Kind); + DependentTypeOfExprType(*this, tofExpr, Kind); DependentTypeOfExprTypes.InsertNode(Canon, InsertPos); toe = Canon; } } else { QualType Canonical = getCanonicalType(tofExpr->getType()); toe = new (*this, alignof(TypeOfExprType)) - TypeOfExprType(tofExpr, Kind, Canonical); + TypeOfExprType(*this, tofExpr, Kind, Canonical); } Types.push_back(toe); return QualType(toe, 0); @@ -6045,8 +6045,8 @@ QualType ASTContext::getTypeOfExprType(Expr *tofExpr, TypeOfKind Kind) const { /// on canonical types (which are always unique). QualType ASTContext::getTypeOfType(QualType tofType, TypeOfKind Kind) const { QualType Canonical = getCanonicalType(tofType); - auto *tot = - new (*this, alignof(TypeOfType)) TypeOfType(tofType, Canonical, Kind); + auto *tot = new (*this, alignof(TypeOfType)) + TypeOfType(*this, tofType, Canonical, Kind); Types.push_back(tot); return QualType(tot, 0); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index cc535aba4936e..5bf1f3dbdbd4b 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1627,9 +1627,10 @@ QualType QualType::stripObjCKindOfType(const ASTContext &constCtx) const { } QualType QualType::getAtomicUnqualifiedType() const { - if (const auto AT = getTypePtr()->getAs()) - return AT->getValueType().getUnqualifiedType(); - return getUnqualifiedType(); + QualType T = *this; + if (const auto AT = T.getTypePtr()->getAs()) + T = AT->getValueType(); + return T.getUnqualifiedType(); } std::optional> @@ -3890,18 +3891,19 @@ QualType MacroQualifiedType::getModifiedType() const { return Inner; } -TypeOfExprType::TypeOfExprType(Expr *E, TypeOfKind Kind, QualType Can) +TypeOfExprType::TypeOfExprType(const ASTContext &Context, Expr *E, + TypeOfKind Kind, QualType Can) : Type(TypeOfExpr, // We have to protect against 'Can' being invalid through its // default argument. Kind == TypeOfKind::Unqualified && !Can.isNull() - ? Can.getAtomicUnqualifiedType() + ? Context.getUnqualifiedArrayType(Can).getAtomicUnqualifiedType() : Can, toTypeDependence(E->getDependence()) | (E->getType()->getDependence() & TypeDependence::VariablyModified)), - TOExpr(E) { - TypeOfBits.IsUnqual = Kind == TypeOfKind::Unqualified; + TOExpr(E), Context(Context) { + TypeOfBits.Kind = static_cast(Kind); } bool TypeOfExprType::isSugared() const { @@ -3911,7 +3913,9 @@ bool TypeOfExprType::isSugared() const { QualType TypeOfExprType::desugar() const { if (isSugared()) { QualType QT = getUnderlyingExpr()->getType(); - return TypeOfBits.IsUnqual ? QT.getAtomicUnqualifiedType() : QT; + return getKind() == TypeOfKind::Unqualified + ? Context.getUnqualifiedArrayType(QT).getAtomicUnqualifiedType() + : QT; } return QualType(this, 0); } @@ -3923,6 +3927,24 @@ void DependentTypeOfExprType::Profile(llvm::FoldingSetNodeID &ID, ID.AddBoolean(IsUnqual); } +TypeOfType::TypeOfType(const ASTContext &Context, QualType T, QualType Can, + TypeOfKind Kind) + : Type(TypeOf, + Kind == TypeOfKind::Unqualified + ? Context.getUnqualifiedArrayType(Can).getAtomicUnqualifiedType() + : Can, + T->getDependence()), + TOType(T), Context(Context) { + TypeOfBits.Kind = static_cast(Kind); +} + +QualType TypeOfType::desugar() const { + QualType QT = getUnmodifiedType(); + return getKind() == TypeOfKind::Unqualified + ? 
Context.getUnqualifiedArrayType(QT).getAtomicUnqualifiedType() + : QT; +} + DecltypeType::DecltypeType(Expr *E, QualType underlyingType, QualType can) // C++11 [temp.type]p2: "If an expression e involves a template parameter, // decltype(e) denotes a unique dependent type." Hence a decltype type is diff --git a/clang/test/Sema/c2x-typeof.c b/clang/test/Sema/c2x-typeof.c index cf985c244f4a4..2cc3f57b509d4 100644 --- a/clang/test/Sema/c2x-typeof.c +++ b/clang/test/Sema/c2x-typeof.c @@ -92,3 +92,50 @@ extern __attribute__((address_space(0))) int type_attr_test_2; // expec void invalid_param_fn(__attribute__((address_space(1))) int i); // expected-error {{parameter may not be qualified with an address space}} typeof(invalid_param_fn) invalid_param_1; typeof_unqual(invalid_param_fn) invalid_param_2; + +// Ensure restrict is stripped +extern int *restrict p1; +extern int *p2; +extern typeof(p1) p1; +extern typeof_unqual(p1) p2; + +// Ensure array qualifications are removed +extern const int aci[2]; +extern const int acii[2][2]; +extern int ai[2]; +extern int aii[2][2]; +extern typeof(aci) aci; +extern typeof_unqual(aci) ai; +extern typeof(acii) acii; +extern typeof_unqual(acii) aii; + +extern int *restrict arpi[2]; +extern int *restrict arpii[2][2]; +extern int *api[2]; +extern int *apii[2][2]; +extern typeof(arpi) arpi; +extern typeof_unqual(arpi) api; +extern typeof(arpii) arpii; +extern typeof_unqual(arpii) apii; + +extern int _Atomic aAi[2]; +extern int _Atomic aAii[2][2]; +extern typeof(aAi) aAi; +extern typeof_unqual(aAi) aAi; +extern typeof(aAii) aAii; +extern typeof_unqual(aAii) aAii; + +extern _Atomic(int) aAi[2]; +extern _Atomic(int) aAii[2][2]; +extern typeof(aAi) aAi; +extern typeof_unqual(aAi) aAi; +extern typeof(aAii) aAii; +extern typeof_unqual(aAii) aAii; + +const char* const animals[] = { "aardvark", "bluejay", "catte" }; +void GH92667(void) { + const char* animals2_array1[3]; + typeof_unqual(animals) animals2_array; + animals2_array1[0] = 0; + animals2_array[0] = 0; +} From 5d42d69d936bc3f29e849aac33d331b198143145 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 17 Jul 2024 17:01:26 +0200 Subject: [PATCH 287/777] [libc] Change rand implementation so all tests pass in both 32- and 64-bit systems (#98692) This patch makes rand select different algorithms depending on the arch. This is needed to avoid a test failure in 32-bit systems where the LSB of rand was not uniform enough when the 64-bit constants are used in 32-bit systems. --- libc/src/stdlib/rand.cpp | 41 +++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/libc/src/stdlib/rand.cpp b/libc/src/stdlib/rand.cpp index 1931727e2d9d1..a8a4fab3377cc 100644 --- a/libc/src/stdlib/rand.cpp +++ b/libc/src/stdlib/rand.cpp @@ -14,20 +14,39 @@ namespace LIBC_NAMESPACE_DECL { -// An implementation of the xorshift64star pseudo random number generator. This -// is a good general purpose generator for most non-cryptographics applications. LLVM_LIBC_FUNCTION(int, rand, (void)) { unsigned long orig = rand_next.load(cpp::MemoryOrder::RELAXED); - for (;;) { - unsigned long x = orig; - x ^= x >> 12; - x ^= x << 25; - x ^= x >> 27; - if (rand_next.compare_exchange_strong(orig, x, cpp::MemoryOrder::ACQUIRE, - cpp::MemoryOrder::RELAXED)) - return static_cast((x * 0x2545F4914F6CDD1Dul) >> 32) & RAND_MAX; - sleep_briefly(); + + // An implementation of the xorshift64star pseudo random number generator. 
+ // This is a good general purpose generator for most non-cryptographics + // applications. + if constexpr (sizeof(void *) == sizeof(uint64_t)) { + for (;;) { + unsigned long x = orig; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + if (rand_next.compare_exchange_strong(orig, x, cpp::MemoryOrder::ACQUIRE, + cpp::MemoryOrder::RELAXED)) + return static_cast((x * 0x2545F4914F6CDD1Dul) >> 32) & RAND_MAX; + sleep_briefly(); + } + } else { + // This is the xorshift32 pseudo random number generator, slightly different + // from the 64-bit star version above, as the previous version fails to + // generate uniform enough LSB in 32-bit systems. + for (;;) { + unsigned long x = orig; + x ^= x >> 13; + x ^= x << 27; + x ^= x >> 5; + if (rand_next.compare_exchange_strong(orig, x, cpp::MemoryOrder::ACQUIRE, + cpp::MemoryOrder::RELAXED)) + return static_cast(x * 1597334677ul) & RAND_MAX; + sleep_briefly(); + } } + __builtin_unreachable(); } } // namespace LIBC_NAMESPACE_DECL From 60b6f43ea188f3427985f6328a638375063a9f44 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Wed, 17 Jul 2024 18:10:17 +0300 Subject: [PATCH 288/777] [libc++][ranges] LWG4001: `iota_view` should provide `empty` (#79687) Implements: https://wg21.link/LWG4001 - https://eel.is/c++draft/range.iota.view --------- Co-authored-by: Zingam Co-authored-by: Will Hawkins --- libcxx/docs/Status/Cxx2cIssues.csv | 2 +- libcxx/include/__ranges/iota_view.h | 2 + .../range.iota.view/empty.pass.cpp | 117 ++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/empty.pass.cpp diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index f9a70aee1bf46..b5732ee981ffb 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -38,7 +38,7 @@ "`3974 `__","``mdspan::operator[]`` should not copy ``OtherIndexTypes``","Kona November 2023","","","" "`3987 `__","Including ```` doesn't provide ``std::begin``/``end``","Kona November 2023","","","|flat_containers|" "`3990 `__","Program-defined specializations of ``std::tuple`` and ``std::variant`` can't be properly supported","Kona November 2023","","","" -"`4001 `__","``iota_view`` should provide ``empty``","Kona November 2023","","","|ranges|" +"`4001 `__","``iota_view`` should provide ``empty``","Kona November 2023","|Complete|","19.0","|ranges|" "","","","","","" "`3767 `__","``codecvt`` incorrectly added to locale","Tokyo March 2024","","","" "`3919 `__","``enumerate_view`` may invoke UB for sized common non-forward underlying ranges","Tokyo March 2024","","","|ranges|" diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index c0f5ed936a66d..b2fa958a0f56e 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -344,6 +344,8 @@ class iota_view : public view_interface> { return __iterator{__bound_sentinel_}; } + _LIBCPP_HIDE_FROM_ABI constexpr bool empty() const { return __value_ == __bound_sentinel_; } + _LIBCPP_HIDE_FROM_ABI constexpr auto size() const requires(same_as<_Start, _BoundSentinel> && __advanceable<_Start>) || (integral<_Start> && integral<_BoundSentinel>) || sized_sentinel_for<_BoundSentinel, _Start> diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/empty.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/empty.pass.cpp new file mode 100644 index 0000000000000..a8a34e152643a --- /dev/null +++ 
b/libcxx/test/std/ranges/range.factories/range.iota.view/empty.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// constexpr bool empty() const; + +#include +#include +#include +#include +#include + +#include "types.h" + +template +concept HasEmpty = requires(const R r) { + std::ranges::empty(r); + { r.empty() } -> std::same_as; +}; + +constexpr void test_empty_iota_sfinae() { + std::vector ev; + + auto iv = std::views::iota(std::ranges::begin(ev), std::ranges::end(ev)); + + static_assert(HasEmpty); + static_assert(HasEmpty); +} + +constexpr void test_nonempty_iota_sfinae() { + // Default ctr + { + std::ranges::iota_view> iv; + + static_assert(HasEmpty); + } + // Value pass + { + std::ranges::iota_view iv(SomeInt(94)); + + static_assert(HasEmpty); + } + + { + std::vector v; + auto it = std::back_inserter(v); + auto iv = std::views::iota(it); + + static_assert(HasEmpty); + } + { + std::vector v{'b', 'a', 'b', 'a', 'z', 'm', 't'}; + auto it = std::back_inserter(v); + auto iv = std::views::iota(it); + + static_assert(HasEmpty); + } +} + +constexpr void test_empty_iota() { + std::vector ev; + + auto iv = std::views::iota(std::ranges::begin(ev), std::ranges::end(ev)); + + assert(iv.empty()); + assert(std::as_const(iv).empty()); +} + +constexpr void test_nonempty_iota() { + // Default ctr + { + std::ranges::iota_view> iv; + + assert(!iv.empty()); + } + // Value pass + { + std::ranges::iota_view iv(SomeInt(94)); + + assert(!iv.empty()); + } + + { + std::vector v; + auto it = std::back_inserter(v); + auto iv = std::views::iota(it); + + assert(!iv.empty()); + } + { + std::vector v{'b', 'a', 'b', 'a', 'z', 'm', 't'}; + auto it = std::back_inserter(v); + auto iv = std::views::iota(it); + + assert(!iv.empty()); + } +} + +constexpr bool test() { + test_empty_iota(); + test_nonempty_iota(); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} From 351a4b27da7dfe2ec6ae3400bd681eae1fb5180f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 17 Jul 2024 10:09:42 -0500 Subject: [PATCH 289/777] [AMDGPU] Simplify alias stripping to use utility function --- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 3bf72d1a5d40a..146649a7e2d54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -65,10 +65,7 @@ static const Function *getCalleeFunction(const MachineOperand &Op) { assert(Op.getImm() == 0); return nullptr; } - const GlobalValue *GV = Op.getGlobal(); - while (auto *GA = dyn_cast(GV)) - GV = cast(GA->getOperand(0)); - return cast(GV); + return cast(Op.getGlobal()->stripPointerCastsAndAliases()); } static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, From e9fdc689dbb141a318bb7be40001cef03ca67301 Mon Sep 17 00:00:00 2001 From: smanna12 Date: Wed, 17 Jul 2024 08:18:32 -0700 Subject: [PATCH 290/777] [Clang][NFC] Remove unnecessary copy (#97902) Reported by Static Analyzer Tool: In 
clang::ASTNodeImporter::VisitCountAttributedType(clang::CountAttributedType const *): Using the auto keyword without an & causes the copy of an object of type TypeCoupledDeclRefInfo --- clang/lib/AST/ASTImporter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 4e1b3a5a94de7..0c27f6f5df2da 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1551,7 +1551,7 @@ ASTNodeImporter::VisitCountAttributedType(const CountAttributedType *T) { Expr *CountExpr = importChecked(Err, T->getCountExpr()); SmallVector CoupledDecls; - for (auto TI : T->dependent_decls()) { + for (const TypeCoupledDeclRefInfo &TI : T->dependent_decls()) { Expected ToDeclOrErr = import(TI.getDecl()); if (!ToDeclOrErr) return ToDeclOrErr.takeError(); From 73799b46072c2241ae32c87f478a7e2a30c0e1a3 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 17 Jul 2024 17:20:15 +0200 Subject: [PATCH 291/777] [libc] Added missing operator delete generated by gcc/clang (#67457) This patch adds two operators delete that are being generated by clang 15 on rv32 (operator delete(void *mem, std::align_val_t)) and by gcc 13 on intel 64 (operator delete(void *mem, unsigned long)). --- libc/test/UnitTest/HermeticTestUtils.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/libc/test/UnitTest/HermeticTestUtils.cpp b/libc/test/UnitTest/HermeticTestUtils.cpp index 85e5cf02ff613..47f813b0b7a4e 100644 --- a/libc/test/UnitTest/HermeticTestUtils.cpp +++ b/libc/test/UnitTest/HermeticTestUtils.cpp @@ -124,7 +124,7 @@ unsigned long __getauxval(unsigned long id) { } // extern "C" -void *operator new(unsigned long size, void *ptr) { return ptr; } +void *operator new(size_t size, void *ptr) { return ptr; } void *operator new(size_t size) { return malloc(size); } @@ -137,3 +137,16 @@ void operator delete(void *) { } void operator delete(void *ptr, size_t size) { __builtin_trap(); } + +// Defining members in the std namespace is not preferred. But, we do it here +// so that we can use it to define the operator new which takes std::align_val_t +// argument. +namespace std { +enum class align_val_t : size_t {}; +} // namespace std + +void operator delete(void *mem, std::align_val_t) noexcept { __builtin_trap(); } + +void operator delete(void *mem, unsigned int, std::align_val_t) noexcept { + __builtin_trap(); +} From daab6fc5357b3a7f8b6780134d5cb6130f92329b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Jul 2024 16:30:27 +0100 Subject: [PATCH 292/777] [Transforms] DXILResource.cpp - fix MSVC "not all control paths return a value" warning. NFC. 
--- llvm/lib/Transforms/Utils/DXILResource.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Utils/DXILResource.cpp b/llvm/lib/Transforms/Utils/DXILResource.cpp index 7281c7ad04531..bf45654a591b5 100644 --- a/llvm/lib/Transforms/Utils/DXILResource.cpp +++ b/llvm/lib/Transforms/Utils/DXILResource.cpp @@ -49,6 +49,7 @@ bool ResourceInfo::isTyped() const { case ResourceKind::NumEntries: llvm_unreachable("Invalid resource kind"); } + llvm_unreachable("Unhandled ResourceKind enum"); } bool ResourceInfo::isFeedback() const { From 3ad7108c3cf843cac6301db3f73ccea9661bc4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 17 Jul 2024 08:39:18 -0700 Subject: [PATCH 293/777] [flang][cuda] Avoid temporary when RHS is a logical constant (#99078) Enhance the detection of constant on the RHS for logical cases so we don't create a temporary. --- flang/lib/Lower/Bridge.cpp | 8 ++++++-- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 77e038dac13ff..a4043744c6386 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4230,9 +4230,13 @@ class FirConverter : public Fortran::lower::AbstractConverter { auto transferKindAttr = cuf::DataTransferKindAttr::get( builder.getContext(), cuf::DataTransferKind::HostDevice); if (!rhs.isVariable()) { + mlir::Value base = rhs; + if (auto convertOp = + mlir::dyn_cast(rhs.getDefiningOp())) + base = convertOp.getValue(); // Special case if the rhs is a constant. - if (matchPattern(rhs.getDefiningOp(), mlir::m_Constant())) { - builder.create(loc, rhs, lhsVal, + if (matchPattern(base.getDefiningOp(), mlir::m_Constant())) { + builder.create(loc, base, lhsVal, transferKindAttr); } else { auto associate = hlfir::genAssociateExpr( diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 1383b73ea44d6..d657f819dfbf1 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -265,3 +265,12 @@ end subroutine ! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array, %14#1 {bindc_name = ".tmp", uniq_name = ""} ! CHECK: cuf.data_transfer ! CHECK: fir.freemem %[[TEMP]] : !fir.heap> + +subroutine sub14() + logical(4), device :: log(10) + log = .true. +end subroutine + +! CHECK-LABEL: func.func @_QPsub14() +! CHECK: %[[TRUE:.*]] = arith.constant true +! CHECK: cuf.data_transfer %[[TRUE]] to %{{.*}}#0 {transfer_kind = #cuf.cuda_transfer} : i1, !fir.ref>> From 666d224248707f373577b5b049b5b0229100006c Mon Sep 17 00:00:00 2001 From: Mike Crowe Date: Wed, 17 Jul 2024 16:45:47 +0100 Subject: [PATCH 294/777] [clang-tidy] Fix modernize-use-std-print/format for fmt (#99021) When fixing #92896 in 0e62d5cf55479981da5e05e406bbca4afb3cdc4f (#94104) I failed to spot that I'd broken converting from fmt::printf, fmt::fprintf and fmt::sprintf in these checks since the format parameter of those functions is not a simple character pointer. The first part of the previous fix to avoid the assert and instead produce an error message was sufficient. It was only the second part that required the format parameter of the called function to be a simple character pointer that was problematic. 
Let's remove that second part and add the now-expected error messages to the lit tests along with fixing the prototype for the fmt functions to more accurately reflect the ones used by the fmt library so they are actually useful. Fixes #92896 --- .../modernize/UseStdFormatCheck.cpp | 14 +++++----- .../clang-tidy/modernize/UseStdPrintCheck.cpp | 4 --- .../modernize/use-std-format-custom.cpp | 1 + .../checkers/modernize/use-std-format-fmt.cpp | 6 ++--- .../modernize/use-std-print-custom.cpp | 26 +++++++++++++++++-- 5 files changed, 34 insertions(+), 17 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp index d082faa786b37..6cef21f1318a2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp @@ -47,15 +47,13 @@ void UseStdFormatCheck::registerPPCallbacks(const SourceManager &SM, } void UseStdFormatCheck::registerMatchers(MatchFinder *Finder) { - auto CharPointerType = - hasType(pointerType(pointee(matchers::isSimpleChar()))); Finder->addMatcher( - callExpr( - argumentCountAtLeast(1), hasArgument(0, stringLiteral(isOrdinary())), - callee(functionDecl( - unless(cxxMethodDecl()), hasParameter(0, CharPointerType), - matchers::matchesAnyListedName(StrFormatLikeFunctions)) - .bind("func_decl"))) + callExpr(argumentCountAtLeast(1), + hasArgument(0, stringLiteral(isOrdinary())), + callee(functionDecl(unless(cxxMethodDecl()), + matchers::matchesAnyListedName( + StrFormatLikeFunctions)) + .bind("func_decl"))) .bind("strformat"), this); } diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp index 1ea170c3cd310..ff990feadc0c1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp @@ -95,15 +95,12 @@ unusedReturnValue(clang::ast_matchers::StatementMatcher MatchedCallExpr) { } void UseStdPrintCheck::registerMatchers(MatchFinder *Finder) { - auto CharPointerType = - hasType(pointerType(pointee(matchers::isSimpleChar()))); if (!PrintfLikeFunctions.empty()) Finder->addMatcher( unusedReturnValue( callExpr(argumentCountAtLeast(1), hasArgument(0, stringLiteral(isOrdinary())), callee(functionDecl(unless(cxxMethodDecl()), - hasParameter(0, CharPointerType), matchers::matchesAnyListedName( PrintfLikeFunctions)) .bind("func_decl"))) @@ -116,7 +113,6 @@ void UseStdPrintCheck::registerMatchers(MatchFinder *Finder) { callExpr(argumentCountAtLeast(2), hasArgument(1, stringLiteral(isOrdinary())), callee(functionDecl(unless(cxxMethodDecl()), - hasParameter(1, CharPointerType), matchers::matchesAnyListedName( FprintfLikeFunctions)) .bind("func_decl"))) diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-custom.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-custom.cpp index c025113055cce..7da0bb02ad766 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-custom.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-custom.cpp @@ -63,4 +63,5 @@ std::string unsupported_format_parameter_type() // No fixes here because the format parameter of the function called is not a // string. 
return bad_format_type_strprintf(""); +// CHECK-MESSAGES: [[@LINE-1]]:10: warning: unable to use 'fmt::format' instead of 'bad_format_type_strprintf' because first argument is not a narrow string literal [modernize-use-std-format] } diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp index 9d136cf309168..1eaf18ac11996 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp @@ -12,9 +12,9 @@ namespace fmt { -// Use const char * for the format since the real type is hard to mock up. -template -std::string sprintf(const char *format, const Args&... args); +template +std::basic_string sprintf(const S& fmt, const T&... args); } // namespace fmt std::string fmt_sprintf_simple() { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-custom.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-custom.cpp index 09720001ab837..687b8c0780b01 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-custom.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-custom.cpp @@ -1,8 +1,8 @@ // RUN: %check_clang_tidy -std=c++23 %s modernize-use-std-print %t -- \ // RUN: -config="{CheckOptions: \ // RUN: { \ -// RUN: modernize-use-std-print.PrintfLikeFunctions: 'unqualified_printf;::myprintf; mynamespace::myprintf2; bad_format_type_printf', \ -// RUN: modernize-use-std-print.FprintfLikeFunctions: '::myfprintf; mynamespace::myfprintf2; bad_format_type_fprintf' \ +// RUN: modernize-use-std-print.PrintfLikeFunctions: 'unqualified_printf;::myprintf; mynamespace::myprintf2; bad_format_type_printf; fmt::printf', \ +// RUN: modernize-use-std-print.FprintfLikeFunctions: '::myfprintf; mynamespace::myfprintf2; bad_format_type_fprintf; fmt::fprintf' \ // RUN: } \ // RUN: }" \ // RUN: -- -isystem %clang_tidy_headers @@ -106,5 +106,27 @@ void unsupported_format_parameter_type() // No fixes here because the format parameter of the function called is not a // string. bad_format_type_printf("Hello %s", "world"); +// CHECK-MESSAGES: [[@LINE-1]]:3: warning: unable to use 'std::print' instead of 'bad_format_type_printf' because first argument is not a narrow string literal [modernize-use-std-print] + bad_format_type_fprintf(stderr, "Hello %s", "world"); +// CHECK-MESSAGES: [[@LINE-1]]:3: warning: unable to use 'std::print' instead of 'bad_format_type_fprintf' because first argument is not a narrow string literal [modernize-use-std-print] +} + +namespace fmt { + template + inline int printf(const S& fmt, const T&... args); + + template + inline int fprintf(std::FILE* f, const S& fmt, const T&... 
args);
+}
+
+void fmt_printf()
+{
+  fmt::printf("fmt::printf templated %s argument %d\n", "format", 424);
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: use 'std::println' instead of 'printf' [modernize-use-std-print]
+  // CHECK-FIXES: std::println("fmt::printf templated {} argument {}", "format", 424);
+
+  fmt::fprintf(stderr, "fmt::fprintf templated %s argument %d\n", "format", 425);
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: use 'std::println' instead of 'fprintf' [modernize-use-std-print]
+  // CHECK-FIXES: std::println(stderr, "fmt::fprintf templated {} argument {}", "format", 425);
 }

From c63125d4533a22a200c3b5b1efb8ac3ce4b1cb69 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Wed, 17 Jul 2024 17:05:40 +0100
Subject: [PATCH 295/777] [mlir] Fix block merging (#97697)

With this PR I am trying to address
https://github.com/llvm/llvm-project/issues/63230.

What changed:
- While merging identical blocks, don't add a block argument if it is
  "identical" to another block argument, i.e., if the two block arguments
  refer to the same `Value`. The operations' operands in the block will
  point to the argument we already inserted. This needs to happen for all
  the arguments we pass to the different successors of the parent block
  (see the sketch after this list).
- After merging the blocks, get rid of "unnecessary" arguments, i.e., if
  all the predecessors pass the same value for a block argument, there is
  no need to pass it as an argument.
- This last simplification clashed with `BufferDeallocationSimplification`.
  The reason, I think, is that `BufferDeallocationSimplification` contains
  an analysis based on the block structure. If we simplify the block
  structure (by merging blocks and/or dropping block arguments), that
  analysis becomes invalid. The solution I found is to do a more prudent
  simplification when running that pass.

**Note**: this is a rework of #96871. I ran all the integration tests
(`-DMLIR_INCLUDE_INTEGRATION_TESTS=ON`) and they passed.
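As an illustration of the first point, here is a minimal sketch in the LLVM
dialect. It is not taken from the patch; the values `%a` and `%b` and the
callees `@foo` and `@bar` are made up for the example. The two blocks below
are identical except for the value they operate on:

  llvm.cond_br %cond, ^bb1, ^bb2
^bb1:
  // Same structure as ^bb2, only the value used differs.
  llvm.call @foo(%a) : (i64) -> ()
  llvm.call @bar(%a) : (i64) -> ()
  llvm.return
^bb2:
  llvm.call @foo(%b) : (i64) -> ()
  llvm.call @bar(%b) : (i64) -> ()
  llvm.return

Merging them without the pruning would introduce one block argument per
differing operand position, so each incoming edge would forward the same
value twice: (%a, %a) and (%b, %b). With the pruning, the duplicate is
detected in every forwarded list and a single argument remains:

  llvm.cond_br %cond, ^bb1(%a : i64), ^bb1(%b : i64)
^bb1(%arg0: i64):
  llvm.call @foo(%arg0) : (i64) -> ()
  llvm.call @bar(%arg0) : (i64) -> ()
  llvm.return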
--- .../BufferDeallocationSimplification.cpp | 9 +- mlir/lib/Transforms/Utils/RegionUtils.cpp | 204 +++++++++++++++++- .../dealloc-branchop-interface.mlir | 20 +- .../Linalg/detensorize_entry_block.mlir | 6 +- mlir/test/Dialect/Linalg/detensorize_if.mlir | 67 +++--- .../Dialect/Linalg/detensorize_while.mlir | 12 +- .../Linalg/detensorize_while_impure_cf.mlir | 12 +- .../Linalg/detensorize_while_pure_cf.mlir | 4 +- .../Transforms/canonicalize-block-merge.mlir | 6 +- mlir/test/Transforms/canonicalize-dce.mlir | 8 +- .../Transforms/make-isolated-from-above.mlir | 18 +- .../test-canonicalize-merge-large-blocks.mlir | 162 ++++++++++++++ 12 files changed, 445 insertions(+), 83 deletions(-) create mode 100644 mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp index 954485cfede3d..5227b22653eef 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp @@ -463,10 +463,15 @@ struct BufferDeallocationSimplificationPass SplitDeallocWhenNotAliasingAnyOther, RetainedMemrefAliasingAlwaysDeallocatedMemref>(&getContext(), analysis); + // We don't want that the block structure changes invalidating the + // `BufferOriginAnalysis` so we apply the rewrites witha `Normal` level of + // region simplification + GreedyRewriteConfig config; + config.enableRegionSimplification = GreedySimplifyRegionLevel::Normal; populateDeallocOpCanonicalizationPatterns(patterns, &getContext()); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + config))) signalPassFailure(); } }; diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp index 4c0f15bafbaba..946d65cef4186 100644 --- a/mlir/lib/Transforms/Utils/RegionUtils.cpp +++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp @@ -9,6 +9,7 @@ #include "mlir/Transforms/RegionUtils.h" #include "mlir/Analysis/TopologicalSortUtils.h" #include "mlir/IR/Block.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Operation.h" #include "mlir/IR/PatternMatch.h" @@ -16,11 +17,15 @@ #include "mlir/IR/Value.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Support/LogicalResult.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include +#include using namespace mlir; @@ -674,6 +679,91 @@ static bool ableToUpdatePredOperands(Block *block) { return true; } +/// Prunes the redundant list of arguments. E.g., if we are passing an argument +/// list like [x, y, z, x] this would return [x, y, z] and it would update the +/// `block` (to whom the argument are passed to) accordingly. 
+static SmallVector, 2> pruneRedundantArguments( + const SmallVector, 2> &newArguments, + RewriterBase &rewriter, Block *block) { + + SmallVector, 2> newArgumentsPruned( + newArguments.size(), SmallVector()); + + if (newArguments.empty()) + return newArguments; + + // `newArguments` is a 2D array of size `numLists` x `numArgs` + unsigned numLists = newArguments.size(); + unsigned numArgs = newArguments[0].size(); + + // Map that for each arg index contains the index that we can use in place of + // the original index. E.g., if we have newArgs = [x, y, z, x], we will have + // idxToReplacement[3] = 0 + llvm::DenseMap idxToReplacement; + + // This is a useful data structure to track the first appearance of a Value + // on a given list of arguments + DenseMap firstValueToIdx; + for (unsigned j = 0; j < numArgs; ++j) { + Value newArg = newArguments[0][j]; + if (!firstValueToIdx.contains(newArg)) + firstValueToIdx[newArg] = j; + } + + // Go through the first list of arguments (list 0). + for (unsigned j = 0; j < numArgs; ++j) { + bool shouldReplaceJ = false; + unsigned replacement = 0; + // Look back to see if there are possible redundancies in list 0. Please + // note that we are using a map to annotate when an argument was seen first + // to avoid a O(N^2) algorithm. This has the drawback that if we have two + // lists like: + // list0: [%a, %a, %a] + // list1: [%c, %b, %b] + // We cannot simplify it, because firstVlaueToIdx[%a] = 0, but we cannot + // point list1[1](==%b) or list1[2](==%b) to list1[0](==%c). However, since + // the number of arguments can be potentially unbounded we cannot afford a + // O(N^2) algorithm (to search to all the possible pairs) and we need to + // accept the trade-off. + unsigned k = firstValueToIdx[newArguments[0][j]]; + if (k != j) { + shouldReplaceJ = true; + replacement = k; + // If a possible redundancy is found, then scan the other lists: we + // can prune the arguments if and only if they are redundant in every + // list. + for (unsigned i = 1; i < numLists; ++i) + shouldReplaceJ = + shouldReplaceJ && (newArguments[i][k] == newArguments[i][j]); + } + // Save the replacement. + if (shouldReplaceJ) + idxToReplacement[j] = replacement; + } + + // Populate the pruned argument list. + for (unsigned i = 0; i < numLists; ++i) + for (unsigned j = 0; j < numArgs; ++j) + if (!idxToReplacement.contains(j)) + newArgumentsPruned[i].push_back(newArguments[i][j]); + + // Replace the block's redundant arguments. + SmallVector toErase; + for (auto [idx, arg] : llvm::enumerate(block->getArguments())) { + if (idxToReplacement.contains(idx)) { + Value oldArg = block->getArgument(idx); + Value newArg = block->getArgument(idxToReplacement[idx]); + rewriter.replaceAllUsesWith(oldArg, newArg); + toErase.push_back(idx); + } + } + + // Erase the block's redundant arguments. + for (unsigned idxToErase : llvm::reverse(toErase)) + block->eraseArgument(idxToErase); + return newArgumentsPruned; +} + LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) { // Don't consider clusters that don't have blocks to merge. if (blocksToMerge.empty()) @@ -722,6 +812,10 @@ LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) { } } } + + // Prune redundant arguments and update the leader block argument list + newArguments = pruneRedundantArguments(newArguments, rewriter, leaderBlock); + // Update the predecessors for each of the blocks. 
auto updatePredecessors = [&](Block *block, unsigned clusterIndex) { for (auto predIt = block->pred_begin(), predE = block->pred_end(); @@ -818,6 +912,108 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter, return success(anyChanged); } +static LogicalResult dropRedundantArguments(RewriterBase &rewriter, + Block &block) { + SmallVector argsToErase; + + // Go through the arguments of the block. + for (auto [argIdx, blockOperand] : llvm::enumerate(block.getArguments())) { + bool sameArg = true; + Value commonValue; + + // Go through the block predecessor and flag if they pass to the block + // different values for the same argument. + for (auto predIt = block.pred_begin(), predE = block.pred_end(); + predIt != predE; ++predIt) { + auto branch = dyn_cast((*predIt)->getTerminator()); + if (!branch) { + sameArg = false; + break; + } + unsigned succIndex = predIt.getSuccessorIndex(); + SuccessorOperands succOperands = branch.getSuccessorOperands(succIndex); + auto branchOperands = succOperands.getForwardedOperands(); + if (!commonValue) { + commonValue = branchOperands[argIdx]; + } else { + if (branchOperands[argIdx] != commonValue) { + sameArg = false; + break; + } + } + } + + // If they are passing the same value, drop the argument. + if (commonValue && sameArg) { + argsToErase.push_back(argIdx); + + // Remove the argument from the block. + rewriter.replaceAllUsesWith(blockOperand, commonValue); + } + } + + // Remove the arguments. + for (auto argIdx : llvm::reverse(argsToErase)) { + block.eraseArgument(argIdx); + + // Remove the argument from the branch ops. + for (auto predIt = block.pred_begin(), predE = block.pred_end(); + predIt != predE; ++predIt) { + auto branch = cast((*predIt)->getTerminator()); + unsigned succIndex = predIt.getSuccessorIndex(); + SuccessorOperands succOperands = branch.getSuccessorOperands(succIndex); + succOperands.erase(argIdx); + } + } + return success(!argsToErase.empty()); +} + +/// This optimization drops redundant argument to blocks. I.e., if a given +/// argument to a block receives the same value from each of the block +/// predecessors, we can remove the argument from the block and use directly the +/// original value. This is a simple example: +/// +/// %cond = llvm.call @rand() : () -> i1 +/// %val0 = llvm.mlir.constant(1 : i64) : i64 +/// %val1 = llvm.mlir.constant(2 : i64) : i64 +/// %val2 = llvm.mlir.constant(3 : i64) : i64 +/// llvm.cond_br %cond, ^bb1(%val0 : i64, %val1 : i64), ^bb2(%val0 : i64, %val2 +/// : i64) +/// +/// ^bb1(%arg0 : i64, %arg1 : i64): +/// llvm.call @foo(%arg0, %arg1) +/// +/// The previous IR can be rewritten as: +/// %cond = llvm.call @rand() : () -> i1 +/// %val0 = llvm.mlir.constant(1 : i64) : i64 +/// %val1 = llvm.mlir.constant(2 : i64) : i64 +/// %val2 = llvm.mlir.constant(3 : i64) : i64 +/// llvm.cond_br %cond, ^bb1(%val1 : i64), ^bb2(%val2 : i64) +/// +/// ^bb1(%arg0 : i64): +/// llvm.call @foo(%val0, %arg0) +/// +static LogicalResult dropRedundantArguments(RewriterBase &rewriter, + MutableArrayRef regions) { + llvm::SmallSetVector worklist; + for (Region ®ion : regions) + worklist.insert(®ion); + bool anyChanged = false; + while (!worklist.empty()) { + Region *region = worklist.pop_back_val(); + + // Add any nested regions to the worklist. 
+ for (Block &block : *region) { + anyChanged = succeeded(dropRedundantArguments(rewriter, block)); + + for (Operation &op : block) + for (Region &nestedRegion : op.getRegions()) + worklist.insert(&nestedRegion); + } + } + return success(anyChanged); +} + //===----------------------------------------------------------------------===// // Region Simplification //===----------------------------------------------------------------------===// @@ -832,8 +1028,12 @@ LogicalResult mlir::simplifyRegions(RewriterBase &rewriter, bool eliminatedBlocks = succeeded(eraseUnreachableBlocks(rewriter, regions)); bool eliminatedOpsOrArgs = succeeded(runRegionDCE(rewriter, regions)); bool mergedIdenticalBlocks = false; - if (mergeBlocks) + bool droppedRedundantArguments = false; + if (mergeBlocks) { mergedIdenticalBlocks = succeeded(mergeIdenticalBlocks(rewriter, regions)); + droppedRedundantArguments = + succeeded(dropRedundantArguments(rewriter, regions)); + } return success(eliminatedBlocks || eliminatedOpsOrArgs || - mergedIdenticalBlocks); + mergedIdenticalBlocks || droppedRedundantArguments); } diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir index 5e8104f83cc4d..8e14990502143 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir @@ -178,7 +178,7 @@ func.func @condBranchDynamicTypeNested( // CHECK-NEXT: ^bb1 // CHECK-NOT: bufferization.dealloc // CHECK-NOT: bufferization.clone -// CHECK: cf.br ^bb5([[ARG1]], %false{{[0-9_]*}} : +// CHECK: cf.br ^bb6([[ARG1]], %false{{[0-9_]*}} : // CHECK: ^bb2([[IDX:%.*]]:{{.*}}) // CHECK: [[ALLOC1:%.*]] = memref.alloc([[IDX]]) // CHECK-NEXT: test.buffer_based @@ -186,20 +186,24 @@ func.func @condBranchDynamicTypeNested( // CHECK-NEXT: [[OWN:%.+]] = arith.select [[ARG0]], [[ARG0]], [[NOT_ARG0]] // CHECK-NOT: bufferization.dealloc // CHECK-NOT: bufferization.clone -// CHECK: cf.cond_br{{.*}}, ^bb3, ^bb3 +// CHECK: cf.cond_br{{.*}}, ^bb3, ^bb4 // CHECK-NEXT: ^bb3: // CHECK-NOT: bufferization.dealloc // CHECK-NOT: bufferization.clone -// CHECK: cf.br ^bb4([[ALLOC1]], [[OWN]] -// CHECK-NEXT: ^bb4([[ALLOC2:%.*]]:{{.*}}, [[COND1:%.+]]:{{.*}}) +// CHECK: cf.br ^bb5([[ALLOC1]], [[OWN]] +// CHECK-NEXT: ^bb4: // CHECK-NOT: bufferization.dealloc // CHECK-NOT: bufferization.clone -// CHECK: cf.br ^bb5([[ALLOC2]], [[COND1]] -// CHECK-NEXT: ^bb5([[ALLOC4:%.*]]:{{.*}}, [[COND2:%.+]]:{{.*}}) +// CHECK: cf.br ^bb5([[ALLOC1]], [[OWN]] +// CHECK-NEXT: ^bb5([[ALLOC2:%.*]]:{{.*}}, [[COND1:%.+]]:{{.*}}) +// CHECK-NOT: bufferization.dealloc +// CHECK-NOT: bufferization.clone +// CHECK: cf.br ^bb6([[ALLOC2]], [[COND1]] +// CHECK-NEXT: ^bb6([[ALLOC4:%.*]]:{{.*}}, [[COND2:%.+]]:{{.*}}) // CHECK-NEXT: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[ALLOC4]] // CHECK-NEXT: [[OWN:%.+]]:2 = bufferization.dealloc ([[BASE]] :{{.*}}) if ([[COND2]]) retain ([[ALLOC4]], [[ARG2]] : -// CHECK: cf.br ^bb6([[ALLOC4]], [[OWN]]#0 -// CHECK-NEXT: ^bb6([[ALLOC5:%.*]]:{{.*}}, [[COND3:%.+]]:{{.*}}) +// CHECK: cf.br ^bb7([[ALLOC4]], [[OWN]]#0 +// CHECK-NEXT: ^bb7([[ALLOC5:%.*]]:{{.*}}, [[COND3:%.+]]:{{.*}}) // CHECK: test.copy // CHECK: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[ALLOC5]] // CHECK-NEXT: 
bufferization.dealloc ([[BASE]] : {{.*}}) if ([[COND3]]) diff --git a/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir b/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir index d1a89226fdb58..50a2d6bf532aa 100644 --- a/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir +++ b/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir @@ -15,7 +15,7 @@ func.func @main(%arg0: tensor) -> tensor { // CHECK-LABEL: @main // CHECK-SAME: (%[[ARG0:.+]]: tensor) -> tensor // CHECK: %[[EXTRACTED:.+]] = tensor.extract %[[ARG0]][] : tensor -// CHECK: cf.br ^{{.*}}(%[[EXTRACTED]] : f32) -// CHECK: ^{{.*}}(%[[ARG1:.+]]: f32): -// CHECK: %[[ELEMENTS:.+]] = tensor.from_elements %[[ARG1]] : tensor +// CHECK: cf.br ^{{.*}} +// CHECK: ^{{.*}}: +// CHECK: %[[ELEMENTS:.+]] = tensor.from_elements %[[EXTRACTED]] : tensor // CHECK: return %[[ELEMENTS]] : tensor diff --git a/mlir/test/Dialect/Linalg/detensorize_if.mlir b/mlir/test/Dialect/Linalg/detensorize_if.mlir index 8d17763c04b6c..c728ad21d2209 100644 --- a/mlir/test/Dialect/Linalg/detensorize_if.mlir +++ b/mlir/test/Dialect/Linalg/detensorize_if.mlir @@ -42,18 +42,15 @@ func.func @main() -> (tensor) attributes {} { } // CHECK-LABEL: func @main() -// CHECK-DAG: arith.constant 0 -// CHECK-DAG: arith.constant 10 -// CHECK: cf.br ^[[bb1:.*]](%{{.*}}: i32) -// CHECK-NEXT: ^[[bb1]](%{{.*}}: i32): -// CHECK-NEXT: arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb3(%{{.*}} : i32) -// CHECK-NEXT: ^[[bb2]](%{{.*}}: i32) -// CHECK-NEXT: arith.addi %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.br ^[[bb3:.*]](%{{.*}} : i32) -// CHECK-NEXT: ^[[bb3]](%{{.*}}: i32) -// CHECK-NEXT: tensor.from_elements %{{.*}} : tensor -// CHECK-NEXT: return %{{.*}} +// CHECK-DAG: %[[cst:.*]] = arith.constant dense<0> +// CHECK-DAG: arith.constant true +// CHECK: cf.br +// CHECK-NEXT: ^[[bb1:.*]]: +// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb3 +// CHECK-NEXT: ^[[bb2]] +// CHECK-NEXT: cf.br ^[[bb3:.*]] +// CHECK-NEXT: ^[[bb3]] +// CHECK-NEXT: return %[[cst]] // CHECK-NEXT: } // ----- @@ -106,20 +103,17 @@ func.func @main() -> (tensor) attributes {} { } // CHECK-LABEL: func @main() -// CHECK-DAG: arith.constant 0 -// CHECK-DAG: arith.constant 10 -// CHECK: cf.br ^[[bb1:.*]](%{{.*}}: i32) -// CHECK-NEXT: ^[[bb1]](%{{.*}}: i32): -// CHECK-NEXT: arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb3(%{{.*}} : i32) -// CHECK-NEXT: ^[[bb2]](%{{.*}}: i32) -// CHECK-NEXT: arith.addi %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.br ^[[bb3:.*]](%{{.*}} : i32) -// CHECK-NEXT: ^[[bb3]](%{{.*}}: i32) -// CHECK-NEXT: cf.br ^[[bb4:.*]](%{{.*}} : i32) -// CHECK-NEXT: ^[[bb4]](%{{.*}}: i32) -// CHECK-NEXT: tensor.from_elements %{{.*}} : tensor -// CHECK-NEXT: return %{{.*}} +// CHECK-DAG: %[[cst:.*]] = arith.constant dense<0> +// CHECK-DAG: arith.constant true +// CHECK: cf.br ^[[bb1:.*]] +// CHECK-NEXT: ^[[bb1:.*]]: +// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb3 +// CHECK-NEXT: ^[[bb2]]: +// CHECK-NEXT: cf.br ^[[bb3:.*]] +// CHECK-NEXT: ^[[bb3]]: +// CHECK-NEXT: cf.br ^[[bb4:.*]] +// CHECK-NEXT: ^[[bb4]]: +// CHECK-NEXT: return %[[cst]] // CHECK-NEXT: } // ----- @@ -171,16 +165,13 @@ func.func @main() -> (tensor) attributes {} { } // CHECK-LABEL: func @main() -// CHECK-DAG: arith.constant 0 -// CHECK-DAG: arith.constant 10 -// CHECK: cf.br ^[[bb1:.*]](%{{.*}}: i32) -// CHECK-NEXT: ^[[bb1]](%{{.*}}: i32): -// CHECK-NEXT: arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : 
i32), ^bb2(%{{.*}} : i32) -// CHECK-NEXT: ^[[bb2]](%{{.*}}: i32) -// CHECK-NEXT: arith.addi %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.br ^[[bb3:.*]](%{{.*}} : i32) -// CHECK-NEXT: ^[[bb3]](%{{.*}}: i32) -// CHECK-NEXT: tensor.from_elements %{{.*}} : tensor -// CHECK-NEXT: return %{{.*}} +// CHECK-DAG: %[[cst:.*]] = arith.constant dense<10> +// CHECK-DAG: arith.constant true +// CHECK: cf.br ^[[bb1:.*]] +// CHECK-NEXT: ^[[bb1]]: +// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb2 +// CHECK-NEXT: ^[[bb2]] +// CHECK-NEXT: cf.br ^[[bb3:.*]] +// CHECK-NEXT: ^[[bb3]] +// CHECK-NEXT: return %[[cst]] // CHECK-NEXT: } diff --git a/mlir/test/Dialect/Linalg/detensorize_while.mlir b/mlir/test/Dialect/Linalg/detensorize_while.mlir index aa30900f76a33..580a97d3a851b 100644 --- a/mlir/test/Dialect/Linalg/detensorize_while.mlir +++ b/mlir/test/Dialect/Linalg/detensorize_while.mlir @@ -46,11 +46,11 @@ func.func @main(%farg0: tensor, %farg1: tensor) -> tensor attribu // DET-ALL: cf.br ^[[bb1:.*]](%{{.*}} : i32) // DET-ALL: ^[[bb1]](%{{.*}}: i32) // DET-ALL: arith.cmpi slt, {{.*}} -// DET-ALL: cf.cond_br {{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32) -// DET-ALL: ^[[bb2]](%{{.*}}: i32) +// DET-ALL: cf.cond_br {{.*}}, ^[[bb2:.*]], ^[[bb3:.*]] +// DET-ALL: ^[[bb2]] // DET-ALL: arith.addi {{.*}} // DET-ALL: cf.br ^[[bb1]](%{{.*}} : i32) -// DET-ALL: ^[[bb3]](%{{.*}}: i32) +// DET-ALL: ^[[bb3]]: // DET-ALL: tensor.from_elements {{.*}} // DET-ALL: return %{{.*}} : tensor @@ -62,10 +62,10 @@ func.func @main(%farg0: tensor, %farg1: tensor) -> tensor attribu // DET-CF: cf.br ^[[bb1:.*]](%{{.*}} : i32) // DET-CF: ^[[bb1]](%{{.*}}: i32) // DET-CF: arith.cmpi slt, {{.*}} -// DET-CF: cf.cond_br {{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32) -// DET-CF: ^[[bb2]](%{{.*}}: i32) +// DET-CF: cf.cond_br {{.*}}, ^[[bb2:.*]], ^[[bb3:.*]] +// DET-CF: ^[[bb2]]: // DET-CF: arith.addi {{.*}} // DET-CF: cf.br ^[[bb1]](%{{.*}} : i32) -// DET-CF: ^[[bb3]](%{{.*}}: i32) +// DET-CF: ^[[bb3]]: // DET-CF: tensor.from_elements %{{.*}} : tensor // DET-CF: return %{{.*}} : tensor diff --git a/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir b/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir index 955c7be5ef4c8..414d9b94cbf53 100644 --- a/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir +++ b/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir @@ -74,8 +74,8 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor) -> tensor attr // DET-ALL: } -> tensor // DET-ALL: tensor.extract %{{.*}}[] : tensor // DET-ALL: cmpi slt, %{{.*}}, %{{.*}} : i32 -// DET-ALL: cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32) -// DET-ALL: ^[[bb2]](%{{.*}}: i32) +// DET-ALL: cf.cond_br %{{.*}}, ^[[bb2:.*]], ^[[bb3:.*]] +// DET-ALL: ^[[bb2]]: // DET-ALL: tensor.from_elements %{{.*}} : tensor // DET-ALL: tensor.empty() : tensor<10xi32> // DET-ALL: linalg.generic {{{.*}}} ins(%{{.*}} : tensor) outs(%{{.*}} : tensor<10xi32>) { @@ -83,7 +83,7 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor) -> tensor attr // DET-ALL: linalg.yield %{{.*}} : i32 // DET-ALL: } -> tensor<10xi32> // DET-ALL: cf.br ^[[bb1]](%{{.*}} : tensor<10xi32>) -// DET-ALL: ^[[bb3]](%{{.*}}: i32) +// DET-ALL: ^[[bb3]] // DET-ALL: tensor.from_elements %{{.*}} : tensor // DET-ALL: return %{{.*}} : tensor // DET-ALL: } @@ -95,10 +95,10 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor) -> tensor attr // DET-CF: %{{.*}} = linalg.generic {{{.*}}} ins(%{{.*}} : tensor<10xi32>) outs(%{{.*}} : tensor) { // 
DET-CF: tensor.extract %{{.*}}[] : tensor // DET-CF: cmpi slt, %{{.*}}, %{{.*}} : i32 -// DET-CF: cf.cond_br %{{.*}}, ^bb2(%{{.*}} : tensor), ^bb3(%{{.*}} : tensor) -// DET-CF: ^bb2(%{{.*}}: tensor) +// DET-CF: cf.cond_br %{{.*}}, ^bb2, ^bb3 +// DET-CF: ^bb2: // DET-CF: %{{.*}} = linalg.generic {{{.*}}} ins(%{{.*}} : tensor) outs(%{{.*}} : tensor<10xi32>) { // DET-CF: cf.br ^bb1(%{{.*}} : tensor<10xi32>) -// DET-CF: ^bb3(%{{.*}}: tensor) +// DET-CF: ^bb3: // DET-CF: return %{{.*}} : tensor // DET-CF: } diff --git a/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir b/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir index 6d8d5fe71fca5..913e78272db79 100644 --- a/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir +++ b/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir @@ -49,8 +49,8 @@ func.func @main() -> () attributes {} { // CHECK-NEXT: cf.br ^[[bb1:.*]](%{{.*}} : i32) // CHECK-NEXT: ^[[bb1]](%{{.*}}: i32) // CHECK-NEXT: %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]] -// CHECK-NEXT: ^[[bb2]](%{{.*}}: i32) +// CHECK-NEXT: cf.cond_br %{{.*}}, ^[[bb2:.*]], ^[[bb3:.*]] +// CHECK-NEXT: ^[[bb2]] // CHECK-NEXT: %{{.*}} = arith.addi %{{.*}}, %{{.*}} // CHECK-NEXT: cf.br ^[[bb1]](%{{.*}} : i32) // CHECK-NEXT: ^[[bb3]]: diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir index 3b8b1fce0575a..92cfde817cf7f 100644 --- a/mlir/test/Transforms/canonicalize-block-merge.mlir +++ b/mlir/test/Transforms/canonicalize-block-merge.mlir @@ -87,7 +87,7 @@ func.func @mismatch_operands_matching_arguments(%cond : i1, %arg0 : i32, %arg1 : // CHECK-LABEL: func @mismatch_argument_uses( func.func @mismatch_argument_uses(%cond : i1, %arg0 : i32, %arg1 : i32) -> (i32, i32) { - // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2 + // CHECK: return {{.*}}, {{.*}} cf.cond_br %cond, ^bb1(%arg1 : i32), ^bb2(%arg0 : i32) @@ -101,7 +101,7 @@ func.func @mismatch_argument_uses(%cond : i1, %arg0 : i32, %arg1 : i32) -> (i32, // CHECK-LABEL: func @mismatch_argument_types( func.func @mismatch_argument_types(%cond : i1, %arg0 : i32, %arg1 : i16) { - // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2 + // CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 cf.cond_br %cond, ^bb1(%arg0 : i32), ^bb2(%arg1 : i16) @@ -115,7 +115,7 @@ func.func @mismatch_argument_types(%cond : i1, %arg0 : i32, %arg1 : i16) { // CHECK-LABEL: func @mismatch_argument_count( func.func @mismatch_argument_count(%cond : i1, %arg0 : i32) { - // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2 + // CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 cf.cond_br %cond, ^bb1(%arg0 : i32), ^bb2 diff --git a/mlir/test/Transforms/canonicalize-dce.mlir b/mlir/test/Transforms/canonicalize-dce.mlir index ac034d567a26a..84631947970de 100644 --- a/mlir/test/Transforms/canonicalize-dce.mlir +++ b/mlir/test/Transforms/canonicalize-dce.mlir @@ -137,10 +137,10 @@ func.func @f(%arg0: f32) { // Test case: Test the mechanics of deleting multiple block arguments. 
// CHECK: func @f(%arg0: tensor<1xf32>, %arg1: tensor<2xf32>, %arg2: tensor<3xf32>, %arg3: tensor<4xf32>, %arg4: tensor<5xf32>) -// CHECK-NEXT: "test.br"(%arg1, %arg3)[^bb1] : (tensor<2xf32>, tensor<4xf32>) -// CHECK-NEXT: ^bb1([[VAL0:%.+]]: tensor<2xf32>, [[VAL1:%.+]]: tensor<4xf32>): -// CHECK-NEXT: "foo.print"([[VAL0]]) -// CHECK-NEXT: "foo.print"([[VAL1]]) +// CHECK-NEXT: "test.br"()[^bb1] +// CHECK-NEXT: ^bb1: +// CHECK-NEXT: "foo.print"(%arg1) +// CHECK-NEXT: "foo.print"(%arg3) // CHECK-NEXT: return diff --git a/mlir/test/Transforms/make-isolated-from-above.mlir b/mlir/test/Transforms/make-isolated-from-above.mlir index 58f6cfbc5dd65..a9d4325944fd9 100644 --- a/mlir/test/Transforms/make-isolated-from-above.mlir +++ b/mlir/test/Transforms/make-isolated-from-above.mlir @@ -78,9 +78,9 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index // CHECK-DAG: %[[D1:.+]] = tensor.dim %[[EMPTY]], %[[C1]] // CHECK: test.isolated_one_region_op %[[ARG2]], %[[C0]], %[[C1]], %[[D0]], %[[D1]] // CHECK-NEXT: ^bb0(%[[B0:[a-zA-Z0-9]+]]: index, %[[B1:[a-zA-Z0-9]+]]: index, %[[B2:[a-zA-Z0-9]+]]: index, %[[B3:[a-zA-Z0-9]+]]: index, %[[B4:[a-zA-Z0-9]+]]: index) -// CHECK-NEXT: cf.br ^bb1(%[[B0]] : index) -// CHECK: ^bb1(%[[B5:.+]]: index) -// CHECK: "foo.yield"(%[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]]) +// CHECK-NEXT: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: "foo.yield"(%[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B0]]) // CLONE1-LABEL: func @make_isolated_from_above_multiple_blocks( // CLONE1-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index @@ -95,9 +95,9 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index // CLONE1-NEXT: ^bb0(%[[B0:[a-zA-Z0-9]+]]: index, %[[B1:[a-zA-Z0-9]+]]: index, %[[B2:[a-zA-Z0-9]+]]: index) // CLONE1-DAG: %[[C0_0:.+]] = arith.constant 0 : index // CLONE1-DAG: %[[C1_0:.+]] = arith.constant 1 : index -// CLONE1-NEXT: cf.br ^bb1(%[[B0]] : index) -// CLONE1: ^bb1(%[[B3:.+]]: index) -// CLONE1: "foo.yield"(%[[C0_0]], %[[C1_0]], %[[B1]], %[[B2]], %[[B3]]) +// CLONE1-NEXT: cf.br ^bb1 +// CLONE1: ^bb1: +// CLONE1: "foo.yield"(%[[C0_0]], %[[C1_0]], %[[B1]], %[[B2]], %[[B0]]) // CLONE2-LABEL: func @make_isolated_from_above_multiple_blocks( // CLONE2-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index @@ -110,6 +110,6 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index // CLONE2-DAG: %[[EMPTY:.+]] = tensor.empty(%[[B1]], %[[B2]]) // CLONE2-DAG: %[[D0:.+]] = tensor.dim %[[EMPTY]], %[[C0]] // CLONE2-DAG: %[[D1:.+]] = tensor.dim %[[EMPTY]], %[[C1]] -// CLONE2-NEXT: cf.br ^bb1(%[[B0]] : index) -// CLONE2: ^bb1(%[[B3:.+]]: index) -// CLONE2: "foo.yield"(%[[C0]], %[[C1]], %[[D0]], %[[D1]], %[[B3]]) +// CLONE2-NEXT: cf.br ^bb1 +// CLONE2: ^bb1: +// CLONE2: "foo.yield"(%[[C0]], %[[C1]], %[[D0]], %[[D1]], %[[B0]]) diff --git a/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir b/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir new file mode 100644 index 0000000000000..e821dcd0c2064 --- /dev/null +++ b/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir @@ -0,0 +1,162 @@ + // RUN: mlir-opt -pass-pipeline='builtin.module(llvm.func(canonicalize{region-simplify=aggressive}))' %s | FileCheck %s + +llvm.func @foo(%arg0: i64) + +llvm.func @rand() -> i1 + +// CHECK-LABEL: func @large_merge_block( +llvm.func @large_merge_block(%arg0: i64) { + // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // 
CHECK: %[[C3:.*]] = llvm.mlir.constant(3 : i64) : i64 + // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i64) : i64 + + // CHECK: llvm.cond_br %5, ^bb1(%[[C1]], %[[C3]], %[[C4]], %[[C2]] : i64, i64, i64, i64), ^bb1(%[[C4]], %[[C2]], %[[C1]], %[[C3]] : i64, i64, i64, i64) + // CHECK: ^bb{{.*}}(%[[arg0:.*]]: i64, %[[arg1:.*]]: i64, %[[arg2:.*]]: i64, %[[arg3:.*]]: i64): + // CHECK: llvm.cond_br %{{.*}}, ^bb2(%[[arg0]] : i64), ^bb2(%[[arg3]] : i64) + // CHECK: ^bb{{.*}}(%11: i64): + // CHECK: llvm.br ^bb{{.*}} + // CHECK: ^bb{{.*}}: + // CHECK: llvm.call + // CHECK: llvm.cond_br {{.*}}, ^bb{{.*}}(%[[arg1]] : i64), ^bb{{.*}}(%[[arg2]] : i64) + // CHECK: ^bb{{.*}}: + // CHECK: llvm.call + // CHECK llvm.br ^bb{{.*}} + + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(1 : i64) : i64 + %2 = llvm.mlir.constant(2 : i64) : i64 + %3 = llvm.mlir.constant(3 : i64) : i64 + %4 = llvm.mlir.constant(4 : i64) : i64 + %10 = llvm.icmp "eq" %arg0, %0 : i64 + llvm.cond_br %10, ^bb1, ^bb14 +^bb1: // pred: ^bb0 + %11 = llvm.call @rand() : () -> i1 + llvm.cond_br %11, ^bb2, ^bb3 +^bb2: // pred: ^bb1 + llvm.call @foo(%1) : (i64) -> () + llvm.br ^bb4 +^bb3: // pred: ^bb1 + llvm.call @foo(%2) : (i64) -> () + llvm.br ^bb4 +^bb4: // 2 preds: ^bb2, ^bb3 + %14 = llvm.call @rand() : () -> i1 + llvm.cond_br %14, ^bb5, ^bb6 +^bb5: // pred: ^bb4 + llvm.call @foo(%3) : (i64) -> () + llvm.br ^bb13 +^bb6: // pred: ^bb4 + llvm.call @foo(%4) : (i64) -> () + llvm.br ^bb13 +^bb13: // 2 preds: ^bb11, ^bb12 + llvm.br ^bb27 +^bb14: // pred: ^bb0 + %23 = llvm.call @rand() : () -> i1 + llvm.cond_br %23, ^bb15, ^bb16 +^bb15: // pred: ^bb14 + llvm.call @foo(%4) : (i64) -> () + llvm.br ^bb17 +^bb16: // pred: ^bb14 + llvm.call @foo(%3) : (i64) -> () + llvm.br ^bb17 +^bb17: // 2 preds: ^bb15, ^bb16 + %26 = llvm.call @rand() : () -> i1 + llvm.cond_br %26, ^bb18, ^bb19 +^bb18: // pred: ^bb17 + llvm.call @foo(%2) : (i64) -> () + llvm.br ^bb26 +^bb19: // pred: ^bb17 + llvm.call @foo(%1) : (i64) -> () + llvm.br ^bb26 +^bb26: // 2 preds: ^bb24, ^bb25 + llvm.br ^bb27 +^bb27: // 2 preds: ^bb13, ^bb26 + llvm.return +} + +llvm.func @redundant_args0(%cond : i1) { + %0 = llvm.mlir.constant(0 : i64) : i64 + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.mlir.constant(2 : i64) : i64 + // CHECK %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 + + llvm.cond_br %cond, ^bb1, ^bb2 + + // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C0]], %[[C0]] : i64, i64), ^bb{{.*}}(%[[C1]], %[[C2]] : i64, i64) + // CHECK: ^bb{{.*}}(%{{.*}}: i64, %{{.*}}: i64) +^bb1: + llvm.call @foo(%0) : (i64) -> () + llvm.call @foo(%0) : (i64) -> () + llvm.br ^bb3 +^bb2: + llvm.call @foo(%2) : (i64) -> () + llvm.call @foo(%3) : (i64) -> () + llvm.br ^bb3 +^bb3: + llvm.return +} + +llvm.func @redundant_args1(%cond : i1) { + %0 = llvm.mlir.constant(0 : i64) : i64 + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.mlir.constant(2 : i64) : i64 + // CHECK %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 + + llvm.cond_br %cond, ^bb1, ^bb2 + + // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C1]], %[[C2]] : i64, i64), ^bb{{.*}}(%[[C0]], %[[C0]] : i64, i64) + // CHECK: ^bb{{.*}}(%{{.*}}: i64, %{{.*}}: i64) +^bb1: + llvm.call @foo(%2) : (i64) -> () + llvm.call @foo(%3) : (i64) -> () + llvm.br ^bb3 +^bb2: + llvm.call @foo(%0) : (i64) -> () + llvm.call @foo(%0) : 
(i64) -> () + llvm.br ^bb3 +^bb3: + llvm.return +} + +llvm.func @redundant_args_complex(%cond : i1) { + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(1 : i64) : i64 + %2 = llvm.mlir.constant(2 : i64) : i64 + %3 = llvm.mlir.constant(3 : i64) : i64 + // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK: %[[C3:.*]] = llvm.mlir.constant(3 : i64) : i64 + + llvm.cond_br %cond, ^bb1, ^bb2 + + // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C2]], %[[C1]], %[[C3]] : i64, i64, i64), ^bb{{.*}}(%[[C0]], %[[C3]], %[[C2]] : i64, i64, i64) + // CHECK: ^bb{{.*}}(%[[arg0:.*]]: i64, %[[arg1:.*]]: i64, %[[arg2:.*]]: i64): + // CHECK: llvm.call @foo(%[[arg0]]) + // CHECK: llvm.call @foo(%[[arg0]]) + // CHECK: llvm.call @foo(%[[arg1]]) + // CHECK: llvm.call @foo(%[[C2]]) + // CHECK: llvm.call @foo(%[[arg2]]) + +^bb1: + llvm.call @foo(%2) : (i64) -> () + llvm.call @foo(%2) : (i64) -> () + llvm.call @foo(%1) : (i64) -> () + llvm.call @foo(%2) : (i64) -> () + llvm.call @foo(%3) : (i64) -> () + llvm.br ^bb3 +^bb2: + llvm.call @foo(%0) : (i64) -> () + llvm.call @foo(%0) : (i64) -> () + llvm.call @foo(%3) : (i64) -> () + llvm.call @foo(%2) : (i64) -> () + llvm.call @foo(%2) : (i64) -> () + llvm.br ^bb3 +^bb3: + llvm.return +} From 850a2e68749266ae0944a27fedf81c6f68d5a2c4 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 17 Jul 2024 17:05:55 +0100 Subject: [PATCH 296/777] [flang] Fix compiler warning (#99306) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index e12e21bb00e15..ba71fb3b4040c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6148,7 +6148,7 @@ IntrinsicLibrary::genScan(mlir::Type resultType, fir::ExtendedValue IntrinsicLibrary::genSecond(std::optional resultType, mlir::ArrayRef args) { - assert(args.size() == 1 && !resultType || args.empty() && resultType); + assert((args.size() == 1 && !resultType) || (args.empty() && resultType)); fir::ExtendedValue result; From 136737d94777140952c4948aa4c8fe441aec48e3 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 17 Jul 2024 18:21:36 +0200 Subject: [PATCH 297/777] [libc++] Deprecates rel_ops. (#91642) These operators were deprecated in P0768R1 Library Support for the Spaceship (Comparison) Operator This was discovered while investigating the paper's implementation status. --- libcxx/docs/ReleaseNotes/19.rst | 2 ++ libcxx/include/__utility/rel_ops.h | 8 ++--- .../rel_ops.depr_in_cxx20.verify.cpp | 35 +++++++++++++++++++ .../iterator.rel_ops.compile.pass.cpp | 2 ++ .../utility/operators/rel_ops.pass.cpp | 2 ++ 5 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 libcxx/test/libcxx/depr/depr.rel_ops/rel_ops.depr_in_cxx20.verify.cpp diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index e6d8acb74aeb2..05aeaba7f8716 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -141,6 +141,8 @@ Deprecations and Removals of randomness, and others. Users that were checking whether including a header would fail (e.g. via a script or CMake's ``try_compile`` will experience a change in behavior). +- The operators in the ``rel_ops`` namespace have been deprecated. 
The deprecation is part of the paper + P0768R1 "Library Support for the Spaceship (Comparison) Operator". Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/__utility/rel_ops.h b/libcxx/include/__utility/rel_ops.h index ee8657196d98c..a8caf5bdeaf27 100644 --- a/libcxx/include/__utility/rel_ops.h +++ b/libcxx/include/__utility/rel_ops.h @@ -20,22 +20,22 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace rel_ops { template -inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator!=(const _Tp& __x, const _Tp& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool operator>(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator>(const _Tp& __x, const _Tp& __y) { return __y < __x; } template -inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator<=(const _Tp& __x, const _Tp& __y) { return !(__y < __x); } template -inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator>=(const _Tp& __x, const _Tp& __y) { return !(__x < __y); } diff --git a/libcxx/test/libcxx/depr/depr.rel_ops/rel_ops.depr_in_cxx20.verify.cpp b/libcxx/test/libcxx/depr/depr.rel_ops/rel_ops.depr_in_cxx20.verify.cpp new file mode 100644 index 0000000000000..35457f65fe2eb --- /dev/null +++ b/libcxx/test/libcxx/depr/depr.rel_ops/rel_ops.depr_in_cxx20.verify.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include + +struct A { + int data_ = 0; +}; + +inline bool operator==(const A& x, const A& y) { return x.data_ == y.data_; } + +inline bool operator<(const A& x, const A& y) { return x.data_ < y.data_; } + +void test() { + using namespace std::rel_ops; + A a1{1}; + A a2{2}; + (void)(a1 == a1); + (void)(a1 != a2); // note not deprecated message, due to compiler generated operator. + std::rel_ops::operator!=(a1, a2); // expected-warning {{is deprecated}} + (void)(a1 < a2); + (void)(a1 > a2); // expected-warning 2 {{is deprecated}} + (void)(a1 <= a2); // expected-warning 2 {{is deprecated}} + (void)(a1 >= a2); // expected-warning 2 {{is deprecated}} +} diff --git a/libcxx/test/std/containers/iterator.rel_ops.compile.pass.cpp b/libcxx/test/std/containers/iterator.rel_ops.compile.pass.cpp index aaaa887f72074..9db2449f2f166 100644 --- a/libcxx/test/std/containers/iterator.rel_ops.compile.pass.cpp +++ b/libcxx/test/std/containers/iterator.rel_ops.compile.pass.cpp @@ -8,6 +8,8 @@ // XFAIL: availability-filesystem-missing +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // Make sure the various containers' iterators are not broken by the use of `std::rel_ops`. 
#include // for std::rel_ops diff --git a/libcxx/test/std/utilities/utility/operators/rel_ops.pass.cpp b/libcxx/test/std/utilities/utility/operators/rel_ops.pass.cpp index 52ed642274114..db0c7a61bddd6 100644 --- a/libcxx/test/std/utilities/utility/operators/rel_ops.pass.cpp +++ b/libcxx/test/std/utilities/utility/operators/rel_ops.pass.cpp @@ -8,6 +8,8 @@ // test rel_ops +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + #include #include From 7fc9fb9f3f671ee4b1ccfefaf03ed18cc0c3e3c3 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Wed, 17 Jul 2024 12:23:14 -0400 Subject: [PATCH 298/777] [libc][math] Implement double precision cbrt correctly rounded to all rounding modes. (#99262) Division-less Newton iterations algorithm for cube roots. 1. **Range reduction** For `x = (-1)^s * 2^e * (1.m)`, we get 2 reduced arguments `x_r` and `a` as: ``` x_r = 1.m a = (-1)^s * 2^(e % 3) * (1.m) ``` Then `cbrt(x) = x^(1/3)` can be computed as: ``` x^(1/3) = 2^(e / 3) * a^(1/3). ``` In order to avoid division, we compute `a^(-2/3)` using Newton method and then multiply the results by a: ``` a^(1/3) = a * a^(-2/3). ``` 2. **First approximation to a^(-2/3)** First, we use a degree-7 minimax polynomial generated by Sollya to approximate `x_r^(-2/3)` for `1 <= x_r < 2`. ``` p = P(x_r) ~ x_r^(-2/3), ``` with relative errors bounded by: ``` | p / x_r^(-2/3) - 1 | < 1.16 * 2^-21. ``` Then we multiply with `2^(e % 3)` from a small lookup table to get: ``` x_0 = 2^(-2*(e % 3)/3) * p ~ 2^(-2*(e % 3)/3) * x_r^(-2/3) = a^(-2/3) ``` with relative errors: ``` | x_0 / a^(-2/3) - 1 | < 1.16 * 2^-21. ``` This step is done in double precision. 3. **First Newton iteration** We follow the method described in: Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf to derive multiplicative Newton iterations as below: Let `x_n` be the nth approximation to `a^(-2/3)`. Define the n^th error as: ``` h_n = x_n^3 * a^2 - 1 ``` Then: ``` a^(-2/3) = x_n / (1 + h_n)^(1/3) = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...) ``` using the Taylor series expansion of `(1 + h_n)^(-1/3)`. Apply to `x_0` above: ``` h_0 = x_0^3 * a^2 - 1 = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)), ``` it's bounded by: ``` |h_0| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17. ``` So in the first iteration step, we use: ``` x_1 = x_0 * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3) ``` Its relative error is bounded by: ``` | x_1 / a^(-2/3) - 1 | < 35/242 * |h_0|^4 < 2^-70. ``` Then we perform Ziv's rounding test and check if the answer is exact. This step is done in double-double precision. 4. **Second Newton iteration** If the Ziv's rounding test from the previous step fails, we define the error term: ``` h_1 = x_1^3 * a^2 - 1, ``` And perform another iteration: ``` x_2 = x_1 * (1 - h_1 / 3) ``` with the relative errors exceed the precision of double-double. We then check the Ziv's accuracy test with relative errors < 2^-102 to compensate for rounding errors. 5. **Final iteration** If the Ziv's accuracy test from the previous step fails, we perform another iteration in 128-bit precision and check for exact outputs. 
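As a rough, hedged illustration of the scheme above (this is not the libc implementation: the real code seeds with the Sollya-generated degree-7 polynomial and carries the later steps in double-double and 128-bit precision), a plain-double sketch of one multiplicative Newton step could look like the following; the helper names here are invented for the example and `std::pow` stands in for the minimax seed:

```
// Sketch only: division-free Newton refinement of x ~ a^(-2/3), then
// recover a^(1/3) as a * a^(-2/3).  Assumes a reduced argument 1 <= a < 8.
#include <cmath>
#include <cstdio>

// Stand-in for the degree-7 minimax seed; std::pow is used purely for clarity.
static double seed_inv_cbrt_sq(double a) { return std::pow(a, -2.0 / 3.0); }

// One step: h = x^3 * a^2 - 1, then multiply x by the truncated Taylor
// series of (1 + h)^(-1/3) = 1 - h/3 + 2 h^2/9 - 14 h^3/81 + ...
static double newton_step(double x, double a) {
  double h = x * x * x * a * a - 1.0;
  return x * (1.0 - h / 3.0 + (2.0 / 9.0) * h * h - (14.0 / 81.0) * h * h * h);
}

int main() {
  double a = 5.0;                   // reduced argument
  double x = seed_inv_cbrt_sq(a);   // x ~ a^(-2/3)
  x = newton_step(x, a);            // refine; no division by a anywhere
  std::printf("%.17g vs std::cbrt: %.17g\n", a * x, std::cbrt(a));
  return 0;
}
```

With a seed of the quality described in step 2, a single step of this form is what brings the relative error below 2^-70 before Ziv's rounding test is applied.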
--- libc/config/darwin/arm/entrypoints.txt | 1 + libc/config/gpu/entrypoints.txt | 1 + libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/arm/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/config/windows/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/cbrt.h | 18 ++ libc/src/math/generic/CMakeLists.txt | 20 ++ libc/src/math/generic/cbrt.cpp | 339 ++++++++++++++++++++++ libc/test/src/math/CMakeLists.txt | 12 + libc/test/src/math/cbrt_test.cpp | 104 +++++++ libc/test/src/math/smoke/CMakeLists.txt | 10 + libc/test/src/math/smoke/cbrt_test.cpp | 35 +++ 17 files changed, 548 insertions(+), 1 deletion(-) create mode 100644 libc/src/math/cbrt.h create mode 100644 libc/src/math/generic/cbrt.cpp create mode 100644 libc/test/src/math/cbrt_test.cpp create mode 100644 libc/test/src/math/smoke/cbrt_test.cpp diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index 383118dc781e5..32a08f20b328f 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -123,6 +123,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.copysign libc.src.math.copysignf diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index b0c4652c6b8ee..3c6a92d279e50 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -245,6 +245,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atanf libc.src.math.atanh libc.src.math.atanhf + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.ceil libc.src.math.ceilf diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index dee6ac673643e..9b718c3f81151 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -345,6 +345,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.ceil libc.src.math.ceilf diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index b0ee0e989b5ed..a72f8668808a5 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -216,6 +216,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.ceil libc.src.math.ceilf diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 516a4b6ce3433..266c94d54a9df 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -347,6 +347,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.ceil libc.src.math.ceilf diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b6c55e7aa3033..4d19a28f4a2b3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -370,6 +370,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.canonicalize libc.src.math.canonicalizef libc.src.math.canonicalizel + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.ceil libc.src.math.ceilf diff --git a/libc/config/windows/entrypoints.txt 
b/libc/config/windows/entrypoints.txt index 499c6bfe3a229..afc9ca87ff094 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -121,6 +121,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.atan2f libc.src.math.atanf libc.src.math.atanhf + libc.src.math.cbrt libc.src.math.cbrtf libc.src.math.copysign libc.src.math.copysignf diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 70412e4ed203d..205d14946535e 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -266,7 +266,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | atanpi | | | | | | 7.12.4.10 | F.10.1.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| cbrt | |check| | | | | | 7.12.7.1 | F.10.4.1 | +| cbrt | |check| | |check| | | | | 7.12.7.1 | F.10.4.1 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | compoundn | | | | | | 7.12.7.2 | F.10.4.2 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index aa56152aee141..a4c6b40b98388 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -382,6 +382,7 @@ def StdC : StandardSpec<"stdc"> { ], [], // Enumerations [ + FunctionSpec<"cbrt", RetValSpec, [ArgSpec]>, FunctionSpec<"cbrtf", RetValSpec, [ArgSpec]>, FunctionSpec<"copysign", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 6462afbc54a4f..dc2339896f2bb 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -65,6 +65,7 @@ add_math_entrypoint_object(canonicalizel) add_math_entrypoint_object(canonicalizef16) add_math_entrypoint_object(canonicalizef128) +add_math_entrypoint_object(cbrt) add_math_entrypoint_object(cbrtf) add_math_entrypoint_object(ceil) diff --git a/libc/src/math/cbrt.h b/libc/src/math/cbrt.h new file mode 100644 index 0000000000000..a7d5fe80e57b3 --- /dev/null +++ b/libc/src/math/cbrt.h @@ -0,0 +1,18 @@ +//===-- Implementation header for cbrt --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_CBRT_H +#define LLVM_LIBC_SRC_MATH_CBRT_H + +namespace LIBC_NAMESPACE { + +double cbrt(double x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_CBRT_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index c2f58fb1a4f71..415ca3fbce796 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4180,3 +4180,23 @@ add_entrypoint_object( libc.src.__support.FPUtil.multiply_add libc.src.__support.macros.optimization ) + +add_entrypoint_object( + cbrt + SRCS + cbrt.cpp + HDRS + ../cbrt.h + COMPILE_OPTIONS + -O3 + DEPENDS + libc.hdr.fenv_macros + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.optimization + libc.src.__support.integer_literals +) diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp new file mode 100644 index 0000000000000..e226054332dfa --- /dev/null +++ b/libc/src/math/generic/cbrt.cpp @@ -0,0 +1,339 @@ +//===-- Implementation of cbrt function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cbrt.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/common.h" +#include "src/__support/integer_literals.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) +#define LIBC_MATH_CBRT_SKIP_ACCURATE_PASS +#endif + +namespace LIBC_NAMESPACE_DECL { + +using DoubleDouble = fputil::DoubleDouble; +using Float128 = fputil::DyadicFloat<128>; + +namespace { + +// Initial approximation of x^(-2/3) for 1 <= x < 2. 
+// Polynomial generated by Sollya with: +// > P = fpminimax(x^(-2/3), 7, [|D...|], [1, 2]); +// > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]); +// 0x1.28...p-21 +constexpr double intial_approximation(double x) { + constexpr double COEFFS[8] = { + 0x1.bc52aedead5c6p1, -0x1.b52bfebf110b3p2, 0x1.1d8d71d53d126p3, + -0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2, -0x1.5973c66ee6da7p0, + 0x1.07bf6ac832552p-2, -0x1.5e53d9ce41cb8p-6, + }; + + double x_sq = x * x; + + double c0 = fputil::multiply_add(x, COEFFS[1], COEFFS[0]); + double c1 = fputil::multiply_add(x, COEFFS[3], COEFFS[2]); + double c2 = fputil::multiply_add(x, COEFFS[5], COEFFS[4]); + double c3 = fputil::multiply_add(x, COEFFS[7], COEFFS[6]); + + double x_4 = x_sq * x_sq; + double d0 = fputil::multiply_add(x_sq, c1, c0); + double d1 = fputil::multiply_add(x_sq, c3, c2); + + return fputil::multiply_add(x_4, d1, d0); +} + +// Get the error term for Newton iteration: +// h(x) = x^3 * a^2 - 1, +#ifdef LIBC_TARGET_CPU_HAS_FMA +constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { + return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) + + fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo); +} +#else +constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { + DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3); + return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo; +} +#endif + +} // anonymous namespace + +// Correctly rounded cbrt algorithm: +// +// === Step 1 - Range reduction === +// For x = (-1)^s * 2^e * (1.m), we get 2 reduced arguments x_r and a as: +// x_r = 1.m +// a = (-1)^s * 2^(e % 3) * (1.m) +// Then cbrt(x) = x^(1/3) can be computed as: +// x^(1/3) = 2^(e / 3) * a^(1/3). +// +// In order to avoid division, we compute a^(-2/3) using Newton method and then +// multiply the results by a: +// a^(1/3) = a * a^(-2/3). +// +// === Step 2 - First approximation to a^(-2/3) === +// First, we use a degree-7 minimax polynomial generated by Sollya to +// approximate x_r^(-2/3) for 1 <= x_r < 2. +// p = P(x_r) ~ x_r^(-2/3), +// with relative errors bounded by: +// | p / x_r^(-2/3) - 1 | < 1.16 * 2^-21. +// +// Then we multiply with 2^(e % 3) from a small lookup table to get: +// x_0 = 2^(-2*(e % 3)/3) * p +// ~ 2^(-2*(e % 3)/3) * x_r^(-2/3) +// = a^(-2/3) +// With relative errors: +// | x_0 / a^(-2/3) - 1 | < 1.16 * 2^-21. +// This step is done in double precision. +// +// === Step 3 - First Newton iteration === +// We follow the method described in: +// Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation +// in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf +// to derive multiplicative Newton iterations as below: +// Let x_n be the nth approximation to a^(-2/3). Define the n^th error as: +// h_n = x_n^3 * a^2 - 1 +// Then: +// a^(-2/3) = x_n / (1 + h_n)^(1/3) +// = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...) +// using the Taylor series expansion of (1 + h_n)^(-1/3). +// +// Apply to x_0 above: +// h_0 = x_0^3 * a^2 - 1 +// = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)), +// it's bounded by: +// |h_0| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17. +// So in the first iteration step, we use: +// x_1 = x_0 * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3) +// Its relative error is bounded by: +// | x_1 / a^(-2/3) - 1 | < 35/242 * |h_0|^4 < 2^-70. +// Then we perform Ziv's rounding test and check if the answer is exact. +// This step is done in double-double precision. 
+// +// === Step 4 - Second Newton iteration === +// If the Ziv's rounding test from the previous step fails, we define the error +// term: +// h_1 = x_1^3 * a^2 - 1, +// And perform another iteration: +// x_2 = x_1 * (1 - h_1 / 3) +// with the relative errors exceed the precision of double-double. +// We then check the Ziv's accuracy test with relative errors < 2^-102 to +// compensate for rounding errors. +// +// === Step 5 - Final iteration === +// If the Ziv's accuracy test from the previous step fails, we perform another +// iteration in 128-bit precision and check for exact outputs. +// +// TODO: It is possible to replace this costly computation step with special +// exceptional handling, similar to what was done in the CORE-MATH project: +// https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/cbrt/cbrt.c + +LLVM_LIBC_FUNCTION(double, cbrt, (double x)) { + using FPBits = fputil::FPBits; + + uint64_t x_abs = FPBits(x).abs().uintval(); + + unsigned exp_bias_correction = 682; // 1023 * 2/3 + + if (LIBC_UNLIKELY(x_abs < FPBits::min_normal().uintval() || + x_abs >= FPBits::inf().uintval())) { + if (x_abs == 0 || x_abs >= FPBits::inf().uintval()) + // x is 0, Inf, or NaN. + return x; + + // x is non-zero denormal number. + // Normalize x. + x *= 0x1.0p60; + exp_bias_correction -= 20; + } + + FPBits x_bits(x); + + // When using biased exponent of x in double precision, + // x_e = real_exponent_of_x + 1023 + // Then: + // x_e / 3 = real_exponent_of_x / 3 + 1023/3 + // = real_exponent_of_x / 3 + 341 + // So to make it the correct biased exponent of x^(1/3), we add + // 1023 - 341 = 682 + // to the quotient x_e / 3. + unsigned x_e = static_cast(x_bits.get_biased_exponent()); + unsigned out_e = (x_e / 3 + exp_bias_correction); + unsigned shift_e = x_e % 3; + + // Set x_r = 1.mantissa + double x_r = + FPBits(x_bits.get_mantissa() | + (static_cast(FPBits::EXP_BIAS) << FPBits::FRACTION_LEN)) + .get_val(); + + // Set a = (-1)^x_sign * 2^(x_e % 3) * (1.mantissa) + uint64_t a_bits = x_bits.uintval() & 0x800F'FFFF'FFFF'FFFF; + a_bits |= + (static_cast(shift_e + static_cast(FPBits::EXP_BIAS)) + << FPBits::FRACTION_LEN); + double a = FPBits(a_bits).get_val(); + + // Initial approximation of x_r^(-2/3). + double p = intial_approximation(x_r); + + // Look up for 2^(-2*n/3) used for first approximation step. + constexpr double EXP2_M2_OVER_3[3] = {1.0, 0x1.428a2f98d728bp-1, + 0x1.965fea53d6e3dp-2}; + + // x0 is an initial approximation of a^(-2/3) for 1 <= |a| < 8. + // Relative error: < 1.16 * 2^(-21). + double x0 = static_cast(EXP2_M2_OVER_3[shift_e] * p); + + // First iteration in double precision. + DoubleDouble a_sq = fputil::exact_mult(a, a); + + // h0 = x0^3 * a^2 - 1 + DoubleDouble x0_sq = fputil::exact_mult(x0, x0); + DoubleDouble x0_3 = fputil::quick_mult(x0, x0_sq); + + double h0 = get_error(x0_3, a_sq); + +#ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS + constexpr double REL_ERROR = 0; +#else + constexpr double REL_ERROR = 0x1.0p-51; +#endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS + + // Taylor polynomial of (1 + h)^(-1/3): + // (1 + h)^(-1/3) = 1 - h/3 + 2 h^2 / 9 - 14 h^3 / 81 + ... + constexpr double ERR_COEFFS[3] = { + -0x1.5555555555555p-2 - REL_ERROR, // -1/3 - relative_error + 0x1.c71c71c71c71cp-3, // 2/9 + -0x1.61f9add3c0ca4p-3, // -14/81 + }; + // e0 = -14 * h^2 / 81 + 2 * h / 9 - 1/3 - relative_error. 
+ double e0 = fputil::polyeval(h0, ERR_COEFFS[0], ERR_COEFFS[1], ERR_COEFFS[2]); + double x0_h0 = x0 * h0; + + // x1 = x0 (1 - h0/3 + 2 h0^2 / 9 - 14 h0^3 / 81) + // x1 approximate a^(-2/3) with relative errors bounded by: + // | x1 / a^(-2/3) - 1 | < (34/243) h0^4 < h0 * REL_ERROR + DoubleDouble x1_dd{x0_h0 * e0, x0}; + + // r1 = x1 * a ~ a^(-2/3) * a = a^(1/3). + DoubleDouble r1 = fputil::quick_mult(a, x1_dd); + + // Lambda function to update the exponent of the result. + auto update_exponent = [=](double r) -> double { + uint64_t r_m = FPBits(r).uintval() & 0x800F'FFFF'FFFF'FFFF; + // Adjust exponent and sign. + uint64_t r_bits = + r_m | (static_cast(out_e) << FPBits::FRACTION_LEN); + return FPBits(r_bits).get_val(); + }; + +#ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS + // TODO: We probably don't need to use double-double if accurate tests and + // passes are skipped. + return update_exponent(r1.hi + r1.lo); +#else + // Accurate checks and passes. + double r1_lower = r1.hi + r1.lo; + double r1_upper = + r1.hi + fputil::multiply_add(x0_h0, 2.0 * REL_ERROR * a, r1.lo); + + // Ziv's accuracy test. + if (LIBC_LIKELY(r1_upper == r1_lower)) { + // Test for exact outputs. + // Check if lower (52 - 17 = 35) bits are 0's. + if (LIBC_UNLIKELY((FPBits(r1_lower).uintval() & 0x0000'0007'FFFF'FFFF) == + 0)) { + double r1_err = (r1_lower - r1.hi) - r1.lo; + if (FPBits(r1_err).abs().get_val() < 0x1.0p69) + fputil::clear_except_if_required(FE_INEXACT); + } + + return update_exponent(r1_lower); + } + + // Accuracy test failed, perform another Newton iteration. + double x1 = x1_dd.hi + (e0 + REL_ERROR) * x0_h0; + + // Second iteration in double-double precision. + // h1 = x1^3 * a^2 - 1. + DoubleDouble x1_sq = fputil::exact_mult(x1, x1); + DoubleDouble x1_3 = fputil::quick_mult(x1, x1_sq); + double h1 = get_error(x1_3, a_sq); + + // e1 = -x1*h1/3. + double e1 = h1 * (x1 * -0x1.5555555555555p-2); + // x2 = x1*(1 - h1/3) = x1 + e1 ~ a^(-2/3) with relative errors < 2^-101. + DoubleDouble x2 = fputil::exact_add(x1, e1); + // r2 = a * x2 ~ a * a^(-2/3) = a^(1/3) with relative errors < 2^-100. + DoubleDouble r2 = fputil::quick_mult(a, x2); + + double r2_upper = r2.hi + fputil::multiply_add(a, 0x1.0p-102, r2.lo); + double r2_lower = r2.hi + fputil::multiply_add(a, -0x1.0p-102, r2.lo); + + // Ziv's accuracy test. + if (LIBC_LIKELY(r2_upper == r2_lower)) + return update_exponent(r2_upper); + + // TODO: Investigate removing float128 and just list exceptional cases. + // Apply another Newton iteration with ~126-bit accuracy. 
+ Float128 x2_f128 = fputil::quick_add(Float128(x2.hi), Float128(x2.lo)); + // x2^3 + Float128 x2_3 = + fputil::quick_mul(fputil::quick_mul(x2_f128, x2_f128), x2_f128); + // a^2 + Float128 a_sq_f128 = fputil::quick_mul(Float128(a), Float128(a)); + // x2^3 * a^2 + Float128 x2_3_a_sq = fputil::quick_mul(x2_3, a_sq_f128); + // h2 = x2^3 * a^2 - 1 + Float128 h2_f128 = fputil::quick_add(x2_3_a_sq, Float128(-1.0)); + double h2 = static_cast(h2_f128); + // t2 = 1 - h2 / 3 + Float128 t2 = + fputil::quick_add(Float128(1.0), Float128(h2 * (-0x1.5555555555555p-2))); + // x3 = x2 * (1 - h2 / 3) ~ a^(-2/3) + Float128 x3 = fputil::quick_mul(x2_f128, t2); + // r3 = a * x3 ~ a * a^(-2/3) = a^(1/3) + Float128 r3 = fputil::quick_mul(Float128(a), x3); + + // Check for exact cases: + Float128::MantissaType rounding_bits = + r3.mantissa & 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFFF_u128; + + double result = static_cast(r3); + if ((rounding_bits < 0x0000'0000'0000'0000'0000'0000'0000'000F_u128) || + (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128)) { + // Output is exact. + r3.mantissa &= 0xFFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFF0_u128; + + if (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128) { + Float128 tmp{r3.sign, r3.exponent - 123, + 0x8000'0000'0000'0000'0000'0000'0000'0000_u128}; + Float128 r4 = fputil::quick_add(r3, tmp); + result = static_cast(r4); + } else { + result = static_cast(r3); + } + + fputil::clear_except_if_required(FE_INEXACT); + } + + return update_exponent(result); +#endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 0dc7ae6aae2df..64b4d2c58fb6a 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -2225,6 +2225,18 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + cbrt_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + cbrt_test.cpp + DEPENDS + libc.src.math.cbrt + libc.src.__support.FPUtil.fp_bits +) + add_subdirectory(generic) add_subdirectory(smoke) diff --git a/libc/test/src/math/cbrt_test.cpp b/libc/test/src/math/cbrt_test.cpp new file mode 100644 index 0000000000000..123351496118b --- /dev/null +++ b/libc/test/src/math/cbrt_test.cpp @@ -0,0 +1,104 @@ +//===-- Unittests for cbrt ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/math_macros.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/math/cbrt.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcCbrtTest = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +using LIBC_NAMESPACE::testing::tlog; + +TEST_F(LlvmLibcCbrtTest, InDoubleRange) { + constexpr uint64_t COUNT = 123'451; + uint64_t START = LIBC_NAMESPACE::fputil::FPBits(1.0).uintval(); + uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits(8.0).uintval(); + uint64_t STEP = (STOP - START) / COUNT; + + auto test = [&](mpfr::RoundingMode rounding_mode) { + mpfr::ForceRoundingMode force_rounding(rounding_mode); + if (!force_rounding.success) + return; + + uint64_t fails = 0; + uint64_t tested = 0; + uint64_t total = 0; + double worst_input, worst_output = 0.0; + double ulp = 0.5; + + for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) { + double x = FPBits(v).get_val(); + if (isnan(x) || isinf(x)) + continue; + + double result = LIBC_NAMESPACE::cbrt(x); + ++total; + if (isnan(result) || isinf(result)) + continue; + + ++tested; + + if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cbrt, x, result, + 0.5, rounding_mode)) { + ++fails; + while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cbrt, x, + result, ulp, rounding_mode)) { + worst_input = x; + worst_output = result; + + if (ulp > 1000.0) + break; + + ulp *= 2.0; + } + } + } + if (fails) { + tlog << " Cbrt failed: " << fails << "/" << tested << "/" << total + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(ulp) << ".\n"; + EXPECT_MPFR_MATCH(mpfr::Operation::Cbrt, worst_input, worst_output, 0.5, + rounding_mode); + } + }; + + tlog << " Test Rounding To Nearest...\n"; + test(mpfr::RoundingMode::Nearest); + + tlog << " Test Rounding Downward...\n"; + test(mpfr::RoundingMode::Downward); + + tlog << " Test Rounding Upward...\n"; + test(mpfr::RoundingMode::Upward); + + tlog << " Test Rounding Toward Zero...\n"; + test(mpfr::RoundingMode::TowardZero); +} + +TEST_F(LlvmLibcCbrtTest, SpecialValues) { + constexpr double INPUTS[] = { + 0x1.4f61672324c8p-1028, 0x1.00152f57068b7p-1, 0x1.006509cda9886p-1, + 0x1.018369b92e523p-1, 0x1.10af932ef2bf9p-1, 0x1.1a41117939fdbp-1, + 0x1.2ae8076520d9ap-1, 0x1.a202bfc89ddffp-1, 0x1.a6bb8c803147bp-1, + 0x1.000197b499b1bp+0, 0x1.00065ed266c6cp+0, 0x1.d4306c202c4c2p+0, + 0x1.8fd409efe4851p+1, 0x1.95fd0eb31cc4p+1, 0x1.7cef1d276e335p+2, + 0x1.94910c4fc98p+2, 0x1.a0cc1327bb4c4p+2, 0x1.e7d6ebed549c4p+2, + }; + for (double v : INPUTS) { + double x = FPBits(v).get_val(); + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cbrt, x, + LIBC_NAMESPACE::cbrt(x), 0.5); + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cbrt, -x, + LIBC_NAMESPACE::cbrt(-x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 7f1bc0c204c68..76d5919ad9156 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3971,3 +3971,13 @@ add_fp_unittest( DEPENDS libc.src.math.cbrtf ) + +add_fp_unittest( + cbrt_test + SUITE + libc-math-smoke-tests + SRCS + cbrt_test.cpp + DEPENDS + libc.src.math.cbrt +) diff --git a/libc/test/src/math/smoke/cbrt_test.cpp b/libc/test/src/math/smoke/cbrt_test.cpp new file mode 100644 index 0000000000000..724e0e979decc --- 
/dev/null +++ b/libc/test/src/math/smoke/cbrt_test.cpp @@ -0,0 +1,35 @@ +//===-- Unittests for cbrt ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cbrt.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcCbrtTest = LIBC_NAMESPACE::testing::FPTest; + +using LIBC_NAMESPACE::testing::tlog; + +TEST_F(LlvmLibcCbrtTest, SpecialNumbers) { + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cbrt(aNaN)); + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::cbrt(inf)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::cbrt(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::cbrt(zero)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::cbrt(neg_zero)); + EXPECT_FP_EQ_ALL_ROUNDING(1.0, LIBC_NAMESPACE::cbrt(1.0)); + EXPECT_FP_EQ_ALL_ROUNDING(-1.0, LIBC_NAMESPACE::cbrt(-1.0)); + EXPECT_FP_EQ_ALL_ROUNDING(2.0, LIBC_NAMESPACE::cbrt(8.0)); + EXPECT_FP_EQ_ALL_ROUNDING(-2.0, LIBC_NAMESPACE::cbrt(-8.0)); + EXPECT_FP_EQ_ALL_ROUNDING(3.0, LIBC_NAMESPACE::cbrt(27.0)); + EXPECT_FP_EQ_ALL_ROUNDING(-3.0, LIBC_NAMESPACE::cbrt(-27.0)); + EXPECT_FP_EQ_ALL_ROUNDING(5.0, LIBC_NAMESPACE::cbrt(125.0)); + EXPECT_FP_EQ_ALL_ROUNDING(-5.0, LIBC_NAMESPACE::cbrt(-125.0)); + EXPECT_FP_EQ_ALL_ROUNDING(0x1.0p42, LIBC_NAMESPACE::cbrt(0x1.0p126)); + EXPECT_FP_EQ_ALL_ROUNDING(-0x1.0p42, LIBC_NAMESPACE::cbrt(-0x1.0p126)); + EXPECT_FP_EQ_ALL_ROUNDING(0x1.0p341, LIBC_NAMESPACE::cbrt(0x1.0p1023)); + EXPECT_FP_EQ_ALL_ROUNDING(-0x1.0p341, LIBC_NAMESPACE::cbrt(-0x1.0p1023)); +} From ac1d5facf60c6e83418f8ab9d3fdfb1a8004d4aa Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Wed, 17 Jul 2024 12:33:05 -0400 Subject: [PATCH 299/777] [libc][math] Remove constexpr quantifier from cbrt's utility functions. 
(#99349) Fix full build failures: https://lab.llvm.org/buildbot/#/builders/131/builds/2342 --- libc/src/math/generic/cbrt.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp index e226054332dfa..036664c2aafaf 100644 --- a/libc/src/math/generic/cbrt.cpp +++ b/libc/src/math/generic/cbrt.cpp @@ -35,7 +35,7 @@ namespace { // > P = fpminimax(x^(-2/3), 7, [|D...|], [1, 2]); // > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]); // 0x1.28...p-21 -constexpr double intial_approximation(double x) { +double intial_approximation(double x) { constexpr double COEFFS[8] = { 0x1.bc52aedead5c6p1, -0x1.b52bfebf110b3p2, 0x1.1d8d71d53d126p3, -0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2, -0x1.5973c66ee6da7p0, @@ -59,12 +59,12 @@ constexpr double intial_approximation(double x) { // Get the error term for Newton iteration: // h(x) = x^3 * a^2 - 1, #ifdef LIBC_TARGET_CPU_HAS_FMA -constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { +double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) + fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo); } #else -constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { +double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3); return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo; } From e5ccc7136dab209d769cc97efd7f1596c12d5bec Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 17 Jul 2024 18:41:12 +0200 Subject: [PATCH 300/777] [libc] Add missing -latomic for rv32 (#99337) On rv32, libatomic is needed to build libc when mpfr and gmp are enabled. --- libc/cmake/modules/LLVMLibCCheckMPFR.cmake | 2 +- libc/test/src/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake index 45334a54431ef..a27c2dc0c030b 100644 --- a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake +++ b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake @@ -13,6 +13,6 @@ else() SOURCES ${LIBC_SOURCE_DIR}/utils/MPFRWrapper/check_mpfr.cpp LINK_LIBRARIES - -lmpfr -lgmp + -lmpfr -lgmp -latomic ) endif() diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index a5e7a2a4dee72..b9a50a47af75d 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -24,7 +24,7 @@ function(add_fp_unittest name) message(FATAL_ERROR "Hermetic math test cannot require MPFR.") endif() set(test_type UNIT_TEST_ONLY) - list(APPEND MATH_UNITTEST_LINK_LIBRARIES libcMPFRWrapper -lmpfr -lgmp) + list(APPEND MATH_UNITTEST_LINK_LIBRARIES libcMPFRWrapper -lmpfr -lgmp -latomic) endif() list(APPEND MATH_UNITTEST_LINK_LIBRARIES LibcFPTestHelpers) From a10570ba91050a394ca7766a6d1386dc17f8acc6 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Wed, 17 Jul 2024 09:42:53 -0700 Subject: [PATCH 301/777] [MachO] Detect overflow in section offset. (#98685) The section offset field is only 32 bits; if the computed section offset is larger, make sure we don't emit a corrupt object file. 
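For illustration only (a minimal sketch, not code from the Mach-O writer), the failure mode being guarded against is plain 64-to-32-bit narrowing of the file offset:

```
// Sketch: a section starting beyond 4 GiB cannot be represented in the
// 32-bit offset field of a Mach-O section header; an unchecked narrowing
// write silently wraps, which is the corruption the new checks report instead.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t section_start = (1ull << 32) + 16;              // file offset past 4 GiB
  uint32_t stored = static_cast<uint32_t>(section_start);  // wraps to 16
  std::printf("real offset: %llu, stored offset: %u\n",
              static_cast<unsigned long long>(section_start),
              static_cast<unsigned>(stored));
  return 0;
}
```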
--- llvm/lib/MC/MachObjectWriter.cpp | 18 +++++++++++++++++- llvm/test/MC/MachO/section-offset-overflow.s | 9 +++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/MachO/section-offset-overflow.s diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 53eed0092a5b4..e58e095252d05 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -277,9 +277,12 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, W.write(VMAddr); // address W.write(SectionSize); // size } + assert(isUInt<32>(FileOffset) && "Cannot encode offset of section"); W.write(FileOffset); W.write(Log2(Section.getAlign())); + assert((!NumRelocations || isUInt<32>(RelocationsStart)) && + "Cannot encode offset of relocations"); W.write(NumRelocations ? RelocationsStart : 0); W.write(NumRelocations); W.write(Flags); @@ -775,6 +778,7 @@ void MachObjectWriter::populateAddrSigSection(MCAssembler &Asm) { uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) { uint64_t StartOffset = W.OS.tell(); + auto NumBytesWritten = [&] { return W.OS.tell() - StartOffset; }; populateAddrSigSection(Asm); @@ -904,6 +908,18 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) { unsigned Flags = Sec.getTypeAndAttributes(); if (Sec.hasInstructions()) Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS; + if (!cast(Sec).isVirtualSection() && + !isUInt<32>(SectionStart)) { + Asm.getContext().reportError( + SMLoc(), "cannot encode offset of section; object file too large"); + return NumBytesWritten(); + } + if (NumRelocs && !isUInt<32>(RelocTableEnd)) { + Asm.getContext().reportError( + SMLoc(), + "cannot encode offset of relocations; object file too large"); + return NumBytesWritten(); + } writeSection(Asm, Sec, getSectionAddress(&Sec), SectionStart, Flags, RelocTableEnd, NumRelocs); RelocTableEnd += NumRelocs * sizeof(MachO::any_relocation_info); @@ -1088,7 +1104,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) { StringTable.write(W.OS); } - return W.OS.tell() - StartOffset; + return NumBytesWritten(); } std::unique_ptr diff --git a/llvm/test/MC/MachO/section-offset-overflow.s b/llvm/test/MC/MachO/section-offset-overflow.s new file mode 100644 index 0000000000000..f652cdb9f7e5c --- /dev/null +++ b/llvm/test/MC/MachO/section-offset-overflow.s @@ -0,0 +1,9 @@ +// RUN: not llvm-mc -triple x86_64-apple-macosx -filetype=obj -o /dev/null %s 2>&1 | FileCheck %s + +// CHECK: error: cannot encode offset of section + + .data + .long 1 + .zero 0x100000000 + .const + .long 1 From 2d42f840a2f08ce9635bafe56b2817d8b5099d06 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Wed, 17 Jul 2024 09:44:56 -0700 Subject: [PATCH 302/777] [MC] Fix emission in asm of alignment 2^32. (#98688) The alignment amount was getting corrupted due to accidental truncation. 
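As a minimal sketch of the truncation (not the MCAsmStreamer code itself), assuming an alignment of 2^32 arriving from the IR:

```
// Sketch: passing a 2^32 alignment through a 32-bit `unsigned` parameter
// truncates it to 0 on typical targets; widening the parameter to uint64_t
// and taking a 64-bit log2 preserves the intended ".p2align 32".
#include <cstdint>
#include <cstdio>

static unsigned log2_u64(uint64_t v) { // stand-in for a 64-bit Log2 helper
  unsigned n = 0;
  while (v >>= 1)
    ++n;
  return n;
}

int main() {
  uint64_t align = 1ull << 32;                        // 4 GiB alignment
  unsigned truncated = static_cast<unsigned>(align);  // becomes 0
  std::printf("truncated = %u\n", truncated);
  std::printf(".p2align %u\n", log2_u64(align));      // prints ".p2align 32"
  return 0;
}
```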
--- llvm/lib/MC/MCAsmStreamer.cpp | 12 ++++++------ llvm/test/CodeGen/X86/global-with-max-align.ll | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/X86/global-with-max-align.ll diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 45c32f13e759b..24209e456b5e2 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -254,7 +254,7 @@ class MCAsmStreamer final : public MCStreamer { void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr, SMLoc Loc = SMLoc()) override; - void emitAlignmentDirective(unsigned ByteAlignment, + void emitAlignmentDirective(uint64_t ByteAlignment, std::optional Value, unsigned ValueSize, unsigned MaxBytesToEmit); @@ -1478,23 +1478,23 @@ void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size, EmitEOL(); } -void MCAsmStreamer::emitAlignmentDirective(unsigned ByteAlignment, +void MCAsmStreamer::emitAlignmentDirective(uint64_t ByteAlignment, std::optional Value, unsigned ValueSize, unsigned MaxBytesToEmit) { if (MAI->useDotAlignForAlignment()) { - if (!isPowerOf2_32(ByteAlignment)) + if (!isPowerOf2_64(ByteAlignment)) report_fatal_error("Only power-of-two alignments are supported " "with .align."); OS << "\t.align\t"; - OS << Log2_32(ByteAlignment); + OS << Log2_64(ByteAlignment); EmitEOL(); return; } // Some assemblers don't support non-power of two alignments, so we always // emit alignments as a power of two if possible. - if (isPowerOf2_32(ByteAlignment)) { + if (isPowerOf2_64(ByteAlignment)) { switch (ValueSize) { default: llvm_unreachable("Invalid size for machine code value!"); @@ -1511,7 +1511,7 @@ void MCAsmStreamer::emitAlignmentDirective(unsigned ByteAlignment, llvm_unreachable("Unsupported alignment size!"); } - OS << Log2_32(ByteAlignment); + OS << Log2_64(ByteAlignment); if (Value.has_value() || MaxBytesToEmit) { if (Value.has_value()) { diff --git a/llvm/test/CodeGen/X86/global-with-max-align.ll b/llvm/test/CodeGen/X86/global-with-max-align.ll new file mode 100644 index 0000000000000..5cd360b55540d --- /dev/null +++ b/llvm/test/CodeGen/X86/global-with-max-align.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s + +; Make sure alignment of 2^32 isn't truncated to zero. + +; CHECK: .globl g1 +; CHECK-NEXT: .p2align 32, 0x0 +; CHECK: .globl g2 +; CHECK-NEXT: .p2align 32, 0x0 +; CHECK: .globl g3 +; CHECK-NEXT: .p2align 32, 0x0 + +@g1 = global i32 0, align 4294967296 +@g2 = global i32 33, align 4294967296 +@g3 = constant i32 44, align 4294967296 From c077a4f305aa7faf92a1438b239078c1da1563a9 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Wed, 17 Jul 2024 09:51:00 -0700 Subject: [PATCH 303/777] [mlir][Tensor] Add pattern to fold concats of empty. (#98994) A concatenation of empty tensors can be replaced by a single empty tensor of the concatenated shape. Add this pattern to `populateFoldTensorEmptyPatterns`. 
--- .../Tensor/Transforms/EmptyOpPatterns.cpp | 37 +++++++++++++++++- mlir/test/Dialect/Tensor/fold-empty-op.mlir | 38 +++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp index 43ad0acaf7420..60b0c3e759b6c 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp @@ -136,6 +136,38 @@ struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern { } }; +// Fold concat operation where all the operands are empty. +struct FoldConcatsOfEmpty : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ConcatOp concatOp, + PatternRewriter &rewriter) const override { + auto concatOperands = concatOp.getInputs(); + if (concatOperands.empty()) { + return failure(); + } + auto firstEmptyOp = concatOperands.front().getDefiningOp(); + if (!firstEmptyOp) { + return failure(); + } + auto isDefinedByEmptyOp = [](Value v) -> bool { + return v.getDefiningOp(); + }; + if (!llvm::all_of(concatOperands.drop_front(), isDefinedByEmptyOp)) { + return rewriter.notifyMatchFailure( + concatOp, "not all operands are defined by an empty op"); + } + SmallVector> resultShape; + if (failed(concatOp.reifyResultShapes(rewriter, resultShape))) { + return rewriter.notifyMatchFailure(concatOp, + "failed to get result shape"); + } + rewriter.replaceOpWithNewOp( + concatOp, resultShape[0], concatOp.getResultType().getElementType()); + return success(); + } +}; + } // namespace void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns, @@ -144,6 +176,7 @@ void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns, FoldEmptyTensorWithReshapeOp, FoldEmptyTensorWithReshapeOp>( patterns.getContext(), /*benefit=*/1, foldSingleUseOnly); - patterns.add( - patterns.getContext(), /*benefit=*/1); + patterns.add(patterns.getContext(), + /*benefit=*/1); } diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir index e94f6ec7ec56e..5beb8c250aa10 100644 --- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir +++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir @@ -164,3 +164,41 @@ func.func @double_use_of_tensor_empty(%arg0: index, %arg1: index) // CHECK: tensor.empty{{.*}} : tensor // CHECK: tensor.extract_slice // CHECK: tensor.extract_slice + +// ----- + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) { + %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func"> + transform.apply_patterns to %func_op { + transform.apply_patterns.tensor.fold_tensor_empty + } : !transform.op<"func.func"> + transform.yield + } +} + +func.func @concats_of_empty( + %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) + -> tensor<5x?x?xf32> +{ + %0 = tensor.empty(%arg0, %arg1) : tensor<5x?x?xf32> + %1 = tensor.empty(%arg2, %arg3) : tensor<5x?x?xf32> + %2 = tensor.concat dim(1) %0, %1 : (tensor<5x?x?xf32>, tensor<5x?x?xf32>) -> tensor<5x?x?xf32> + return %2 : tensor<5x?x?xf32> +} +// CHECK: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK: func @concats_of_empty( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index) +// 
CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[EMPTY0:.+]] = tensor.empty(%[[ARG0]], %[[ARG1]]) +// CHECK-DAG: %[[EMPTY1:.+]] = tensor.empty(%[[ARG2]], %[[ARG3]]) +// CHECK: %[[D2:.+]] = tensor.dim %[[EMPTY0]], %[[C2]] +// CHECK-DAG: %[[D0_1:.+]] = tensor.dim %[[EMPTY0]], %[[C1]] +// CHECK-DAG: %[[D1_1:.+]] = tensor.dim %[[EMPTY1]], %[[C1]] +// CHECK-DAG: %[[SUM:.+]] = affine.apply #[[MAP]]()[%[[D0_1]], %[[D1_1]]] +// CHECK: %[[NEW_EMPTY:.+]] = tensor.empty(%[[SUM]], %[[D2]]) +// CHECK: return %[[NEW_EMPTY]] From c736ca85c38ce9c30a2286382d8023604f34f9e8 Mon Sep 17 00:00:00 2001 From: matthew-f <551862+matthew-f@users.noreply.github.com> Date: Wed, 17 Jul 2024 19:05:30 +0200 Subject: [PATCH 304/777] [clang-tidy] Ensure functions are anchored in the global namespace (#99084) The regular expressions match functions that aren't anchored in the global namespace. For example `::connect` matches `QObject::connect` This change is to remove these false positives --- .../bugprone/UnusedReturnValueCheck.cpp | 178 +++++++++--------- 1 file changed, 89 insertions(+), 89 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp index 73373147e96fc..955a9b94dfaf6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp @@ -48,97 +48,97 @@ UnusedReturnValueCheck::UnusedReturnValueCheck(llvm::StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), CheckedFunctions(utils::options::parseStringList( - Options.get("CheckedFunctions", "::std::async$;" - "::std::launder$;" - "::std::remove$;" - "::std::remove_if$;" - "::std::unique$;" - "::std::unique_ptr::release$;" - "::std::basic_string::empty$;" - "::std::vector::empty$;" - "::std::back_inserter$;" - "::std::distance$;" - "::std::find$;" - "::std::find_if$;" - "::std::inserter$;" - "::std::lower_bound$;" - "::std::make_pair$;" - "::std::map::count$;" - "::std::map::find$;" - "::std::map::lower_bound$;" - "::std::multimap::equal_range$;" - "::std::multimap::upper_bound$;" - "::std::set::count$;" - "::std::set::find$;" - "::std::setfill$;" - "::std::setprecision$;" - "::std::setw$;" - "::std::upper_bound$;" - "::std::vector::at$;" + Options.get("CheckedFunctions", "^::std::async$;" + "^::std::launder$;" + "^::std::remove$;" + "^::std::remove_if$;" + "^::std::unique$;" + "^::std::unique_ptr::release$;" + "^::std::basic_string::empty$;" + "^::std::vector::empty$;" + "^::std::back_inserter$;" + "^::std::distance$;" + "^::std::find$;" + "^::std::find_if$;" + "^::std::inserter$;" + "^::std::lower_bound$;" + "^::std::make_pair$;" + "^::std::map::count$;" + "^::std::map::find$;" + "^::std::map::lower_bound$;" + "^::std::multimap::equal_range$;" + "^::std::multimap::upper_bound$;" + "^::std::set::count$;" + "^::std::set::find$;" + "^::std::setfill$;" + "^::std::setprecision$;" + "^::std::setw$;" + "^::std::upper_bound$;" + "^::std::vector::at$;" // C standard library - "::bsearch$;" - "::ferror$;" - "::feof$;" - "::isalnum$;" - "::isalpha$;" - "::isblank$;" - "::iscntrl$;" - "::isdigit$;" - "::isgraph$;" - "::islower$;" - "::isprint$;" - "::ispunct$;" - "::isspace$;" - "::isupper$;" - "::iswalnum$;" - "::iswprint$;" - "::iswspace$;" - "::isxdigit$;" - "::memchr$;" - "::memcmp$;" - "::strcmp$;" - "::strcoll$;" - "::strncmp$;" - "::strpbrk$;" - "::strrchr$;" - "::strspn$;" - "::strstr$;" - 
"::wcscmp$;" + "^::bsearch$;" + "^::ferror$;" + "^::feof$;" + "^::isalnum$;" + "^::isalpha$;" + "^::isblank$;" + "^::iscntrl$;" + "^::isdigit$;" + "^::isgraph$;" + "^::islower$;" + "^::isprint$;" + "^::ispunct$;" + "^::isspace$;" + "^::isupper$;" + "^::iswalnum$;" + "^::iswprint$;" + "^::iswspace$;" + "^::isxdigit$;" + "^::memchr$;" + "^::memcmp$;" + "^::strcmp$;" + "^::strcoll$;" + "^::strncmp$;" + "^::strpbrk$;" + "^::strrchr$;" + "^::strspn$;" + "^::strstr$;" + "^::wcscmp$;" // POSIX - "::access$;" - "::bind$;" - "::connect$;" - "::difftime$;" - "::dlsym$;" - "::fnmatch$;" - "::getaddrinfo$;" - "::getopt$;" - "::htonl$;" - "::htons$;" - "::iconv_open$;" - "::inet_addr$;" - "::isascii$;" - "::isatty$;" - "::mmap$;" - "::newlocale$;" - "::openat$;" - "::pathconf$;" - "::pthread_equal$;" - "::pthread_getspecific$;" - "::pthread_mutex_trylock$;" - "::readdir$;" - "::readlink$;" - "::recvmsg$;" - "::regexec$;" - "::scandir$;" - "::semget$;" - "::setjmp$;" - "::shm_open$;" - "::shmget$;" - "::sigismember$;" - "::strcasecmp$;" - "::strsignal$;" - "::ttyname"))), + "^::access$;" + "^::bind$;" + "^::connect$;" + "^::difftime$;" + "^::dlsym$;" + "^::fnmatch$;" + "^::getaddrinfo$;" + "^::getopt$;" + "^::htonl$;" + "^::htons$;" + "^::iconv_open$;" + "^::inet_addr$;" + "^::isascii$;" + "^::isatty$;" + "^::mmap$;" + "^::newlocale$;" + "^::openat$;" + "^::pathconf$;" + "^::pthread_equal$;" + "^::pthread_getspecific$;" + "^::pthread_mutex_trylock$;" + "^::readdir$;" + "^::readlink$;" + "^::recvmsg$;" + "^::regexec$;" + "^::scandir$;" + "^::semget$;" + "^::setjmp$;" + "^::shm_open$;" + "^::shmget$;" + "^::sigismember$;" + "^::strcasecmp$;" + "^::strsignal$;" + "^::ttyname"))), CheckedReturnTypes(utils::options::parseStringList( Options.get("CheckedReturnTypes", "::std::error_code$;" "::std::error_condition$;" From 86ef699060394c82dcda7e86ff70d8cabeabcc2a Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Wed, 17 Jul 2024 10:05:55 -0700 Subject: [PATCH 305/777] [lldb] progressive progress reporting for darwin kernel/firmware (#98845) When doing firmware/kernel debugging, it is frequent that binaries and debug info need to be retrieved / downloaded, and the lack of progress reports made for a poor experience, with lldb seemingly hung while downloading things over the network. This PR adds progress reports to the critical sites for these use cases. 
--- lldb/source/Core/DynamicLoader.cpp | 24 ++++++- .../DynamicLoaderDarwinKernel.cpp | 64 +++++++++++-------- .../Darwin-Kernel/DynamicLoaderDarwinKernel.h | 4 +- lldb/source/Target/Process.cpp | 9 +++ 4 files changed, 69 insertions(+), 32 deletions(-) diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 7871be6fc451d..7758a87403b5a 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -13,6 +13,7 @@ #include "lldb/Core/ModuleList.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Core/Progress.h" #include "lldb/Core/Section.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Target/MemoryRegionInfo.h" @@ -195,20 +196,37 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( Target &target = process->GetTarget(); Status error; + StreamString prog_str; + if (!name.empty()) { + prog_str << name.str() << " "; + } + if (uuid.IsValid()) + prog_str << uuid.GetAsString(); + if (value_is_offset == 0 && value != LLDB_INVALID_ADDRESS) { + prog_str << "at 0x"; + prog_str.PutHex64(value); + } + if (!uuid.IsValid() && !value_is_offset) { memory_module_sp = ReadUnnamedMemoryModule(process, value, name); - if (memory_module_sp) + if (memory_module_sp) { uuid = memory_module_sp->GetUUID(); + if (uuid.IsValid()) { + prog_str << " "; + prog_str << uuid.GetAsString(); + } + } } ModuleSpec module_spec; module_spec.GetUUID() = uuid; FileSpec name_filespec(name); - if (FileSystem::Instance().Exists(name_filespec)) - module_spec.GetFileSpec() = name_filespec; if (uuid.IsValid()) { + Progress progress("Locating binary", prog_str.GetString().str()); + // Has lldb already seen a module with this UUID? + // Or have external lookup enabled in DebugSymbols on macOS. if (!module_sp) error = ModuleList::GetSharedModule(module_spec, module_sp, nullptr, nullptr, nullptr); diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 8d83937aab668..20e5652c65bf8 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -13,6 +13,7 @@ #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Core/Progress.h" #include "lldb/Core/Section.h" #include "lldb/Interpreter/OptionValueProperties.h" #include "lldb/Symbol/ObjectFile.h" @@ -714,7 +715,7 @@ void DynamicLoaderDarwinKernel::KextImageInfo::SetIsKernel(bool is_kernel) { } bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( - Process *process) { + Process *process, Progress *progress) { Log *log = GetLog(LLDBLog::DynamicLoader); if (IsLoaded()) return true; @@ -757,11 +758,37 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( const ModuleList &target_images = target.GetImages(); m_module_sp = target_images.FindModule(m_uuid); + StreamString prog_str; + // 'mach_kernel' is a fake name we make up to find kernels + // that were located by the local filesystem scan. 
+ if (GetName() != "mach_kernel") + prog_str << GetName() << " "; + if (GetUUID().IsValid()) + prog_str << GetUUID().GetAsString() << " "; + if (GetLoadAddress() != LLDB_INVALID_ADDRESS) { + prog_str << "at 0x"; + prog_str.PutHex64(GetLoadAddress()); + } + + std::unique_ptr progress_up; + if (progress) + progress->Increment(1, prog_str.GetString().str()); + else { + if (IsKernel()) + progress_up = std::make_unique("Loading kernel", + prog_str.GetString().str()); + else + progress_up = std::make_unique("Loading kext", + prog_str.GetString().str()); + } + // Search for the kext on the local filesystem via the UUID if (!m_module_sp && m_uuid.IsValid()) { ModuleSpec module_spec; module_spec.GetUUID() = m_uuid; - module_spec.GetArchitecture() = target.GetArchitecture(); + if (!m_uuid.IsValid()) + module_spec.GetArchitecture() = target.GetArchitecture(); + module_spec.GetFileSpec() = FileSpec(m_name); // If the current platform is PlatformDarwinKernel, create a ModuleSpec // with the filename set to be the bundle ID for this kext, e.g. @@ -770,17 +797,9 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( // system. PlatformSP platform_sp(target.GetPlatform()); if (platform_sp) { - static ConstString g_platform_name( - PlatformDarwinKernel::GetPluginNameStatic()); - if (platform_sp->GetPluginName() == g_platform_name.GetStringRef()) { - ModuleSpec kext_bundle_module_spec(module_spec); - FileSpec kext_filespec(m_name.c_str()); - FileSpecList search_paths = target.GetExecutableSearchPaths(); - kext_bundle_module_spec.GetFileSpec() = kext_filespec; - platform_sp->GetSharedModule(kext_bundle_module_spec, process, - m_module_sp, &search_paths, nullptr, - nullptr); - } + FileSpecList search_paths = target.GetExecutableSearchPaths(); + platform_sp->GetSharedModule(module_spec, process, m_module_sp, + &search_paths, nullptr, nullptr); } // Ask the Target to find this file on the local system, if possible. @@ -1058,12 +1077,9 @@ void DynamicLoaderDarwinKernel::LoadKernelModuleIfNeeded() { } } } - - if (m_kernel.GetLoadAddress() != LLDB_INVALID_ADDRESS) { - if (!m_kernel.LoadImageUsingMemoryModule(m_process)) { + if (m_kernel.GetLoadAddress() != LLDB_INVALID_ADDRESS) + if (!m_kernel.LoadImageUsingMemoryModule(m_process)) m_kernel.LoadImageAtFileAddress(m_process); - } - } // The operating system plugin gets loaded and initialized in // LoadImageUsingMemoryModule when we discover the kernel dSYM. 
For a core @@ -1347,19 +1363,18 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( std::vector> kexts_failed_to_load; if (number_of_new_kexts_being_added > 0) { ModuleList loaded_module_list; + Progress progress("Loading kext", "", number_of_new_kexts_being_added); const uint32_t num_of_new_kexts = kext_summaries.size(); for (uint32_t new_kext = 0; new_kext < num_of_new_kexts; new_kext++) { if (to_be_added[new_kext]) { KextImageInfo &image_info = kext_summaries[new_kext]; - bool kext_successfully_added = true; if (load_kexts) { - if (!image_info.LoadImageUsingMemoryModule(m_process)) { + if (!image_info.LoadImageUsingMemoryModule(m_process, &progress)) { kexts_failed_to_load.push_back(std::pair( kext_summaries[new_kext].GetName(), kext_summaries[new_kext].GetUUID())); image_info.LoadImageAtFileAddress(m_process); - kext_successfully_added = false; } } @@ -1369,13 +1384,6 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( m_process->GetStopID() == image_info.GetProcessStopId()) loaded_module_list.AppendIfNeeded(image_info.GetModule()); - if (load_kexts) { - if (kext_successfully_added) - s.Printf("."); - else - s.Printf("-"); - } - if (log) kext_summaries[new_kext].PutToLog(log); } diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h index 000c382b2c011..69dd8ca579158 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h @@ -16,6 +16,7 @@ #include "lldb/Host/SafeMachO.h" +#include "lldb/Core/Progress.h" #include "lldb/Target/DynamicLoader.h" #include "lldb/Target/Process.h" #include "lldb/Utility/FileSpec.h" @@ -137,7 +138,8 @@ class DynamicLoaderDarwinKernel : public lldb_private::DynamicLoader { bool LoadImageAtFileAddress(lldb_private::Process *process); - bool LoadImageUsingMemoryModule(lldb_private::Process *process); + bool LoadImageUsingMemoryModule(lldb_private::Process *process, + lldb_private::Progress *progress = nullptr); bool IsLoaded() { return m_load_process_stop_id != UINT32_MAX; } diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index d990f8e714e22..d5a639d9beacd 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -21,6 +21,7 @@ #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Core/Progress.h" #include "lldb/Expression/DiagnosticManager.h" #include "lldb/Expression/DynamicCheckerFunctions.h" #include "lldb/Expression/UserExpression.h" @@ -2550,6 +2551,14 @@ ModuleSP Process::ReadModuleFromMemory(const FileSpec &file_spec, ModuleSP module_sp(new Module(file_spec, ArchSpec())); if (module_sp) { Status error; + std::unique_ptr progress_up; + // Reading an ObjectFile from a local corefile is very fast, + // only print a progress update if we're reading from a + // live session which might go over gdb remote serial protocol. 
+ if (IsLiveDebugSession()) + progress_up = std::make_unique( + "Reading binary from memory", file_spec.GetFilename().GetString()); + ObjectFile *objfile = module_sp->GetMemoryObjectFile( shared_from_this(), header_addr, error, size_to_read); if (objfile) From c7b08ac01fa98db7c9ec7c3bbe9784c2d20f91e9 Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Wed, 17 Jul 2024 19:07:47 +0200 Subject: [PATCH 306/777] [lld-macho][test] Require "shell" feature for usage of `ln -s` (#99355) The use of `ln -s` is not guaranteed to be supported on Windows. --- lld/test/MachO/implicit-and-allowable-clients.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/MachO/implicit-and-allowable-clients.test b/lld/test/MachO/implicit-and-allowable-clients.test index 576db33af2ea0..f627d242a0075 100644 --- a/lld/test/MachO/implicit-and-allowable-clients.test +++ b/lld/test/MachO/implicit-and-allowable-clients.test @@ -1,4 +1,4 @@ -# REQUIRES: aarch64 +# REQUIRES: aarch64, shell # RUN: rm -rf %t; split-file %s %t # RUN: ln -s Versions/A/FrameworkPublic.tbd %t/System/Library/Frameworks/FrameworkPublic.framework/ # RUN: ln -s Versions/A/FrameworkPrivate.tbd %t/System/Library/Frameworks/FrameworkPrivate.framework/ From 6867e49fc80c8468f9a5a8376ce7d3b89fd4fb51 Mon Sep 17 00:00:00 2001 From: Angel Zhang Date: Wed, 17 Jul 2024 13:09:15 -0400 Subject: [PATCH 307/777] [mlir][spirv] Implement vector type legalization for function signatures (#98337) ### Description This PR implements a minimal version of function signature conversion to unroll vectors into 1D and with a size supported by SPIR-V (2, 3 or 4 depending on the original dimension). This PR also includes new unit tests that only check for function signature conversion. ### Future Plans - Check for capabilities that support vectors of size 8 or 16. - Set up `OneToNTypeConversion` and `DialectConversion` to replace the current implementation that uses `GreedyPatternRewriteDriver`. - Introduce other vector unrolling patterns to cancel out the `vector.insert_strided_slice` and `vector.extract_strided_slice` ops and fully legalize the vector types in the function body. - Handle `func::CallOp` and declarations. - Restructure the code in `SPIRVConversion.cpp`. - Create test passes for testing sets of patterns in isolation. - Optimize the way original shape is splitted into target shapes, e.g. `vector<5xi32>` can be splitted into `vector<4xi32>` and `vector<1xi32>`. 
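To make the size selection in the description above concrete: the conversion picks the largest of 4, 3, or 2 that evenly divides the vector's trailing dimension, and falls back to 1 otherwise. A minimal sketch of that selection, mirroring the `getComputeVectorSize` helper added in `SPIRVConversion.cpp` below; the splits listed in the comments are the ones exercised by the new tests:

```cpp
#include <cstdint>

// Sketch of the target-width selection used when unrolling vector types
// in function signatures (mirrors getComputeVectorSize in this patch).
static int getComputeVectorSize(int64_t size) {
  for (int i : {4, 3, 2}) {
    if (size % i == 0)
      return i; // largest SPIR-V-friendly width that divides the size
  }
  return 1; // no supported width divides the size, e.g. vector<5xi32>
}

// Resulting signature splits exercised by the new tests:
//   vector<8xi32> -> vector<4xi32>, vector<4xi32>
//   vector<6xi32> -> vector<3xi32>, vector<3xi32>
//   vector<5xi32> -> five vector<1xi32> arguments
//   vector<4xi32> -> left unchanged (already a supported size)
```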
--------- Co-authored-by: Jakub Kuderski --- mlir/include/mlir/Conversion/Passes.td | 10 +- .../SPIRV/Transforms/SPIRVConversion.h | 6 + .../ConvertToSPIRV/ConvertToSPIRVPass.cpp | 20 +- .../Dialect/SPIRV/Transforms/CMakeLists.txt | 6 + .../SPIRV/Transforms/SPIRVConversion.cpp | 290 ++++++++++++++++++ .../test/Conversion/ConvertToSPIRV/arith.mlir | 2 +- .../Conversion/ConvertToSPIRV/combined.mlir | 2 +- .../func-signature-vector-unroll.mlir | 147 +++++++++ .../test/Conversion/ConvertToSPIRV/index.mlir | 2 +- mlir/test/Conversion/ConvertToSPIRV/scf.mlir | 2 +- .../Conversion/ConvertToSPIRV/simple.mlir | 2 +- mlir/test/Conversion/ConvertToSPIRV/ub.mlir | 2 +- .../Conversion/ConvertToSPIRV/vector.mlir | 2 +- mlir/test/lib/Conversion/CMakeLists.txt | 1 + .../Conversion/ConvertToSPIRV/CMakeLists.txt | 16 + .../TestSPIRVFuncSignatureConversion.cpp | 57 ++++ mlir/tools/mlir-opt/CMakeLists.txt | 1 + mlir/tools/mlir-opt/mlir-opt.cpp | 2 + .../llvm-project-overlay/mlir/BUILD.bazel | 6 + .../mlir/test/BUILD.bazel | 15 + 20 files changed, 578 insertions(+), 13 deletions(-) create mode 100644 mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir create mode 100644 mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt create mode 100644 mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 54b94bbfb93d1..748646e605827 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -40,7 +40,15 @@ def ConvertToSPIRVPass : Pass<"convert-to-spirv"> { let description = [{ This is a generic pass to convert to SPIR-V. }]; - let dependentDialects = ["spirv::SPIRVDialect"]; + let dependentDialects = [ + "spirv::SPIRVDialect", + "vector::VectorDialect", + ]; + let options = [ + Option<"runSignatureConversion", "run-signature-conversion", "bool", + /*default=*/"true", + "Run function signature conversion to convert vector types"> + ]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h b/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h index 09eecafc0c8a5..9ad3d5fc85dd3 100644 --- a/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h +++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h @@ -17,7 +17,9 @@ #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" +#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/OneToNTypeConversion.h" #include "llvm/ADT/SmallSet.h" namespace mlir { @@ -134,6 +136,10 @@ class SPIRVConversionTarget : public ConversionTarget { void populateBuiltinFuncToSPIRVPatterns(SPIRVTypeConverter &typeConverter, RewritePatternSet &patterns); +void populateFuncOpVectorRewritePatterns(RewritePatternSet &patterns); + +void populateReturnOpVectorRewritePatterns(RewritePatternSet &patterns); + namespace spirv { class AccessChainOp; diff --git a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp index b5be4654bcb25..003a5feea9e9b 100644 --- a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp +++ b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp @@ -39,18 +39,31 @@ namespace { /// A pass to perform the SPIR-V conversion. 
struct ConvertToSPIRVPass final : impl::ConvertToSPIRVPassBase { + using ConvertToSPIRVPassBase::ConvertToSPIRVPassBase; void runOnOperation() override { MLIRContext *context = &getContext(); Operation *op = getOperation(); + if (runSignatureConversion) { + // Unroll vectors in function signatures to native vector size. + RewritePatternSet patterns(context); + populateFuncOpVectorRewritePatterns(patterns); + populateReturnOpVectorRewritePatterns(patterns); + GreedyRewriteConfig config; + config.strictMode = GreedyRewriteStrictness::ExistingOps; + if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config))) + return signalPassFailure(); + } + spirv::TargetEnvAttr targetAttr = spirv::lookupTargetEnvOrDefault(op); + std::unique_ptr target = + SPIRVConversionTarget::get(targetAttr); SPIRVTypeConverter typeConverter(targetAttr); - RewritePatternSet patterns(context); ScfToSPIRVContext scfToSPIRVContext; - // Populate patterns. + // Populate patterns for each dialect. arith::populateCeilFloorDivExpandOpsPatterns(patterns); arith::populateArithToSPIRVPatterns(typeConverter, patterns); populateBuiltinFuncToSPIRVPatterns(typeConverter, patterns); @@ -60,9 +73,6 @@ struct ConvertToSPIRVPass final populateSCFToSPIRVPatterns(typeConverter, scfToSPIRVContext, patterns); ub::populateUBToSPIRVConversionPatterns(typeConverter, patterns); - std::unique_ptr target = - SPIRVConversionTarget::get(targetAttr); - if (failed(applyPartialConversion(op, *target, std::move(patterns)))) return signalPassFailure(); } diff --git a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt index 821f82ebc0796..4de9b4729e720 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt @@ -16,9 +16,15 @@ add_mlir_dialect_library(MLIRSPIRVConversion ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SPIRV LINK_LIBS PUBLIC + MLIRArithDialect + MLIRDialectUtils MLIRFuncDialect + MLIRIR MLIRSPIRVDialect + MLIRSupport MLIRTransformUtils + MLIRVectorDialect + MLIRVectorTransforms ) add_mlir_dialect_library(MLIRSPIRVTransforms diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp index 4072608dc8f87..e3a09ef1ff684 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp @@ -11,14 +11,24 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/OneToNTypeConversion.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -34,6 +44,43 @@ using namespace mlir; // Utility functions 
//===----------------------------------------------------------------------===// +static int getComputeVectorSize(int64_t size) { + for (int i : {4, 3, 2}) { + if (size % i == 0) + return i; + } + return 1; +} + +static std::optional> getTargetShape(VectorType vecType) { + LLVM_DEBUG(llvm::dbgs() << "Get target shape\n"); + if (vecType.isScalable()) { + LLVM_DEBUG(llvm::dbgs() + << "--scalable vectors are not supported -> BAIL\n"); + return std::nullopt; + } + SmallVector unrollShape = llvm::to_vector<4>(vecType.getShape()); + std::optional> targetShape = + SmallVector(1, getComputeVectorSize(vecType.getShape().back())); + if (!targetShape) { + LLVM_DEBUG(llvm::dbgs() << "--no unrolling target shape defined\n"); + return std::nullopt; + } + auto maybeShapeRatio = computeShapeRatio(unrollShape, *targetShape); + if (!maybeShapeRatio) { + LLVM_DEBUG(llvm::dbgs() + << "--could not compute integral shape ratio -> BAIL\n"); + return std::nullopt; + } + if (llvm::all_of(*maybeShapeRatio, [](int64_t v) { return v == 1; })) { + LLVM_DEBUG(llvm::dbgs() << "--no unrolling needed -> SKIP\n"); + return std::nullopt; + } + LLVM_DEBUG(llvm::dbgs() + << "--found an integral shape ratio to unroll to -> SUCCESS\n"); + return targetShape; +} + /// Checks that `candidates` extension requirements are possible to be satisfied /// with the given `targetEnv`. /// @@ -813,6 +860,249 @@ void mlir::populateBuiltinFuncToSPIRVPatterns(SPIRVTypeConverter &typeConverter, patterns.add(typeConverter, patterns.getContext()); } +//===----------------------------------------------------------------------===// +// func::FuncOp Conversion Patterns +//===----------------------------------------------------------------------===// + +namespace { +/// A pattern for rewriting function signature to convert vector arguments of +/// functions to be of valid types +struct FuncOpVectorUnroll final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(func::FuncOp funcOp, + PatternRewriter &rewriter) const override { + FunctionType fnType = funcOp.getFunctionType(); + + // TODO: Handle declarations. + if (funcOp.isDeclaration()) { + LLVM_DEBUG(llvm::dbgs() + << fnType << " illegal: declarations are unsupported\n"); + return failure(); + } + + // Create a new func op with the original type and copy the function body. + auto newFuncOp = rewriter.create(funcOp.getLoc(), + funcOp.getName(), fnType); + rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(), + newFuncOp.end()); + + Location loc = newFuncOp.getBody().getLoc(); + + Block &entryBlock = newFuncOp.getBlocks().front(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&entryBlock); + + OneToNTypeMapping oneToNTypeMapping(fnType.getInputs()); + + // For arguments that are of illegal types and require unrolling. + // `unrolledInputNums` stores the indices of arguments that result from + // unrolling in the new function signature. `newInputNo` is a counter. + SmallVector unrolledInputNums; + size_t newInputNo = 0; + + // For arguments that are of legal types and do not require unrolling. + // `tmpOps` stores a mapping from temporary operations that serve as + // placeholders for new arguments that will be added later. These operations + // will be erased once the entry block's argument list is updated. + llvm::SmallDenseMap tmpOps; + + // This counts the number of new operations created. + size_t newOpCount = 0; + + // Enumerate through the arguments. 
+ for (auto [origInputNo, origType] : enumerate(fnType.getInputs())) { + // Check whether the argument is of vector type. + auto origVecType = dyn_cast(origType); + if (!origVecType) { + // We need a placeholder for the old argument that will be erased later. + Value result = rewriter.create( + loc, origType, rewriter.getZeroAttr(origType)); + rewriter.replaceAllUsesWith(newFuncOp.getArgument(origInputNo), result); + tmpOps.insert({result.getDefiningOp(), newInputNo}); + oneToNTypeMapping.addInputs(origInputNo, origType); + ++newInputNo; + ++newOpCount; + continue; + } + // Check whether the vector needs unrolling. + auto targetShape = getTargetShape(origVecType); + if (!targetShape) { + // We need a placeholder for the old argument that will be erased later. + Value result = rewriter.create( + loc, origType, rewriter.getZeroAttr(origType)); + rewriter.replaceAllUsesWith(newFuncOp.getArgument(origInputNo), result); + tmpOps.insert({result.getDefiningOp(), newInputNo}); + oneToNTypeMapping.addInputs(origInputNo, origType); + ++newInputNo; + ++newOpCount; + continue; + } + VectorType unrolledType = + VectorType::get(*targetShape, origVecType.getElementType()); + auto originalShape = + llvm::to_vector_of(origVecType.getShape()); + + // Prepare the result vector. + Value result = rewriter.create( + loc, origVecType, rewriter.getZeroAttr(origVecType)); + ++newOpCount; + // Prepare the placeholder for the new arguments that will be added later. + Value dummy = rewriter.create( + loc, unrolledType, rewriter.getZeroAttr(unrolledType)); + ++newOpCount; + + // Create the `vector.insert_strided_slice` ops. + SmallVector strides(targetShape->size(), 1); + SmallVector newTypes; + for (SmallVector offsets : + StaticTileOffsetRange(originalShape, *targetShape)) { + result = rewriter.create( + loc, dummy, result, offsets, strides); + newTypes.push_back(unrolledType); + unrolledInputNums.push_back(newInputNo); + ++newInputNo; + ++newOpCount; + } + rewriter.replaceAllUsesWith(newFuncOp.getArgument(origInputNo), result); + oneToNTypeMapping.addInputs(origInputNo, newTypes); + } + + // Change the function signature. + auto convertedTypes = oneToNTypeMapping.getConvertedTypes(); + auto newFnType = fnType.clone(convertedTypes, fnType.getResults()); + rewriter.modifyOpInPlace(newFuncOp, + [&] { newFuncOp.setFunctionType(newFnType); }); + + // Update the arguments in the entry block. + entryBlock.eraseArguments(0, fnType.getNumInputs()); + SmallVector locs(convertedTypes.size(), newFuncOp.getLoc()); + entryBlock.addArguments(convertedTypes, locs); + + // Replace the placeholder values with the new arguments. We assume there is + // only one block for now. + size_t unrolledInputIdx = 0; + for (auto [count, op] : enumerate(entryBlock.getOperations())) { + // We first look for operands that are placeholders for initially legal + // arguments. + Operation &curOp = op; + for (auto [operandIdx, operandVal] : llvm::enumerate(op.getOperands())) { + Operation *operandOp = operandVal.getDefiningOp(); + if (auto it = tmpOps.find(operandOp); it != tmpOps.end()) { + size_t idx = operandIdx; + rewriter.modifyOpInPlace(&curOp, [&curOp, &newFuncOp, it, idx] { + curOp.setOperand(idx, newFuncOp.getArgument(it->second)); + }); + } + } + // Since all newly created operations are in the beginning, reaching the + // end of them means that any later `vector.insert_strided_slice` should + // not be touched. 
+ if (count >= newOpCount) + continue; + if (auto vecOp = dyn_cast(op)) { + size_t unrolledInputNo = unrolledInputNums[unrolledInputIdx]; + rewriter.modifyOpInPlace(&curOp, [&] { + curOp.setOperand(0, newFuncOp.getArgument(unrolledInputNo)); + }); + ++unrolledInputIdx; + } + } + + // Erase the original funcOp. The `tmpOps` do not need to be erased since + // they have no uses and will be handled by dead-code elimination. + rewriter.eraseOp(funcOp); + return success(); + } +}; +} // namespace + +void mlir::populateFuncOpVectorRewritePatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +//===----------------------------------------------------------------------===// +// func::ReturnOp Conversion Patterns +//===----------------------------------------------------------------------===// + +namespace { +/// A pattern for rewriting function signature and the return op to convert +/// vectors to be of valid types. +struct ReturnOpVectorUnroll final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(func::ReturnOp returnOp, + PatternRewriter &rewriter) const override { + // Check whether the parent funcOp is valid. + auto funcOp = dyn_cast(returnOp->getParentOp()); + if (!funcOp) + return failure(); + + FunctionType fnType = funcOp.getFunctionType(); + OneToNTypeMapping oneToNTypeMapping(fnType.getResults()); + Location loc = returnOp.getLoc(); + + // For the new return op. + SmallVector newOperands; + + // Enumerate through the results. + for (auto [origResultNo, origType] : enumerate(fnType.getResults())) { + // Check whether the argument is of vector type. + auto origVecType = dyn_cast(origType); + if (!origVecType) { + oneToNTypeMapping.addInputs(origResultNo, origType); + newOperands.push_back(returnOp.getOperand(origResultNo)); + continue; + } + // Check whether the vector needs unrolling. + auto targetShape = getTargetShape(origVecType); + if (!targetShape) { + // The original argument can be used. + oneToNTypeMapping.addInputs(origResultNo, origType); + newOperands.push_back(returnOp.getOperand(origResultNo)); + continue; + } + VectorType unrolledType = + VectorType::get(*targetShape, origVecType.getElementType()); + + // Create `vector.extract_strided_slice` ops to form legal vectors from + // the original operand of illegal type. + auto originalShape = + llvm::to_vector_of(origVecType.getShape()); + SmallVector strides(targetShape->size(), 1); + SmallVector newTypes; + Value returnValue = returnOp.getOperand(origResultNo); + for (SmallVector offsets : + StaticTileOffsetRange(originalShape, *targetShape)) { + Value result = rewriter.create( + loc, returnValue, offsets, *targetShape, strides); + newOperands.push_back(result); + newTypes.push_back(unrolledType); + } + oneToNTypeMapping.addInputs(origResultNo, newTypes); + } + + // Change the function signature. + auto newFnType = + FunctionType::get(rewriter.getContext(), TypeRange(fnType.getInputs()), + TypeRange(oneToNTypeMapping.getConvertedTypes())); + rewriter.modifyOpInPlace(funcOp, + [&] { funcOp.setFunctionType(newFnType); }); + + // Replace the return op using the new operands. This will automatically + // update the entry block as well. 
+ rewriter.replaceOp(returnOp, + rewriter.create(loc, newOperands)); + + return success(); + } +}; +} // namespace + +void mlir::populateReturnOpVectorRewritePatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + //===----------------------------------------------------------------------===// // Builtin Variables //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/ConvertToSPIRV/arith.mlir b/mlir/test/Conversion/ConvertToSPIRV/arith.mlir index a2adc0ad9c7a5..1a844a7cd018b 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/arith.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/arith.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-to-spirv -split-input-file %s | FileCheck %s +// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" -split-input-file %s | FileCheck %s //===----------------------------------------------------------------------===// // arithmetic ops diff --git a/mlir/test/Conversion/ConvertToSPIRV/combined.mlir b/mlir/test/Conversion/ConvertToSPIRV/combined.mlir index 9e908465cb142..02b938be775a3 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/combined.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/combined.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-to-spirv %s | FileCheck %s +// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s // CHECK-LABEL: @combined // CHECK: %[[C0_F32:.*]] = spirv.Constant 0.000000e+00 : f32 diff --git a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir new file mode 100644 index 0000000000000..347d282f9ee0c --- /dev/null +++ b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir @@ -0,0 +1,147 @@ +// RUN: mlir-opt -test-spirv-func-signature-conversion -split-input-file %s | FileCheck %s + +// CHECK-LABEL: @simple_scalar +// CHECK-SAME: (%[[ARG0:.+]]: i32) +func.func @simple_scalar(%arg0 : i32) -> i32 { + // CHECK: return %[[ARG0]] : i32 + return %arg0 : i32 +} + +// ----- + +// CHECK-LABEL: @simple_vector_4 +// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>) +func.func @simple_vector_4(%arg0 : vector<4xi32>) -> vector<4xi32> { + // CHECK: return %[[ARG0]] : vector<4xi32> + return %arg0 : vector<4xi32> +} + +// ----- + +// CHECK-LABEL: @simple_vector_5 +// CHECK-SAME: (%[[ARG0:.+]]: vector<1xi32>, %[[ARG1:.+]]: vector<1xi32>, %[[ARG2:.+]]: vector<1xi32>, %[[ARG3:.+]]: vector<1xi32>, %[[ARG4:.+]]: vector<1xi32>) +func.func @simple_vector_5(%arg0 : vector<5xi32>) -> vector<5xi32> { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<5xi32> + // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0], strides = [1]} : vector<1xi32> into vector<5xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [1], strides = [1]} : vector<1xi32> into vector<5xi32> + // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[INSERT1]] {offsets = [2], strides = [1]} : vector<1xi32> into vector<5xi32> + // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [3], strides = [1]} : vector<1xi32> into vector<5xi32> + // CHECK: %[[INSERT4:.*]] = vector.insert_strided_slice %[[ARG4]], %[[INSERT3]] {offsets = [4], strides = [1]} : vector<1xi32> into vector<5xi32> + // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT4]] {offsets = [0], sizes = [1], strides = [1]} : vector<5xi32> to vector<1xi32> + // 
CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT4]] {offsets = [1], sizes = [1], strides = [1]} : vector<5xi32> to vector<1xi32> + // CHECK: %[[EXTRACT2:.*]] = vector.extract_strided_slice %[[INSERT4]] {offsets = [2], sizes = [1], strides = [1]} : vector<5xi32> to vector<1xi32> + // CHECK: %[[EXTRACT3:.*]] = vector.extract_strided_slice %[[INSERT4]] {offsets = [3], sizes = [1], strides = [1]} : vector<5xi32> to vector<1xi32> + // CHECK: %[[EXTRACT4:.*]] = vector.extract_strided_slice %[[INSERT4]] {offsets = [4], sizes = [1], strides = [1]} : vector<5xi32> to vector<1xi32> + // CHECK: return %[[EXTRACT0]], %[[EXTRACT1]], %[[EXTRACT2]], %[[EXTRACT3]], %[[EXTRACT4]] : vector<1xi32>, vector<1xi32>, vector<1xi32>, vector<1xi32>, vector<1xi32> + return %arg0 : vector<5xi32> +} + +// ----- + +// CHECK-LABEL: @simple_vector_6 +// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>) +func.func @simple_vector_6(%arg0 : vector<6xi32>) -> vector<6xi32> { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<6xi32> + // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0], strides = [1]} : vector<3xi32> into vector<6xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [3], strides = [1]} : vector<3xi32> into vector<6xi32> + // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [0], sizes = [3], strides = [1]} : vector<6xi32> to vector<3xi32> + // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [3], sizes = [3], strides = [1]} : vector<6xi32> to vector<3xi32> + // CHECK: return %[[EXTRACT0]], %[[EXTRACT1]] : vector<3xi32>, vector<3xi32> + return %arg0 : vector<6xi32> +} + +// ----- + +// CHECK-LABEL: @simple_vector_8 +// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>) +func.func @simple_vector_8(%arg0 : vector<8xi32>) -> vector<8xi32> { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32> + // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [4], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: return %[[EXTRACT0]], %[[EXTRACT1]] : vector<4xi32>, vector<4xi32> + return %arg0 : vector<8xi32> +} + +// ----- + +// CHECK-LABEL: @vector_6and8 +// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>) +func.func @vector_6and8(%arg0 : vector<6xi32>, %arg1 : vector<8xi32>) -> (vector<6xi32>, vector<8xi32>) { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32> + // CHECK: %[[CST0:.*]] = arith.constant dense<0> : vector<6xi32> + // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST0]] {offsets = [0], strides = [1]} : vector<3xi32> into vector<6xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [3], strides = [1]} : vector<3xi32> into vector<6xi32> + // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[CST]] {offsets = [0], strides = [1]} : 
vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [4], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [0], sizes = [3], strides = [1]} : vector<6xi32> to vector<3xi32> + // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [3], sizes = [3], strides = [1]} : vector<6xi32> to vector<3xi32> + // CHECK: %[[EXTRACT2:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: %[[EXTRACT3:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: return %[[EXTRACT0]], %[[EXTRACT1]], %[[EXTRACT2]], %[[EXTRACT3]] : vector<3xi32>, vector<3xi32>, vector<4xi32>, vector<4xi32> + return %arg0, %arg1 : vector<6xi32>, vector<8xi32> +} + +// ----- + +// CHECK-LABEL: @vector_3and8 +// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>) +func.func @vector_3and8(%arg0 : vector<3xi32>, %arg1 : vector<8xi32>) -> (vector<3xi32>, vector<8xi32>) { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32> + // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG1]], %[[CST]] {offsets = [0], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG2]], %[[INSERT0]] {offsets = [4], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: return %[[ARG0]], %[[EXTRACT0]], %[[EXTRACT1]] : vector<3xi32>, vector<4xi32>, vector<4xi32> + return %arg0, %arg1 : vector<3xi32>, vector<8xi32> +} + +// ----- + +// CHECK-LABEL: @scalar_vector +// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<3xi32>, %[[ARG3:.+]]: i32) +func.func @scalar_vector(%arg0 : vector<8xi32>, %arg1 : vector<3xi32>, %arg2 : i32) -> (vector<8xi32>, vector<3xi32>, i32) { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32> + // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [4], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT1]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xi32> to vector<4xi32> + // CHECK: return %[[EXTRACT0]], %[[EXTRACT1]], %[[ARG2]], %[[ARG3]] : vector<4xi32>, vector<4xi32>, vector<3xi32>, i32 + return %arg0, %arg1, %arg2 : vector<8xi32>, vector<3xi32>, i32 +} + +// ----- + +// CHECK-LABEL: @reduction +// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>, %[[ARG4:.+]]: i32) +func.func @reduction(%arg0 : vector<8xi32>, %arg1 : vector<8xi32>, %arg2 : i32) -> (i32) { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32> + // 
CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [4], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[CST]] {offsets = [0], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [4], strides = [1]} : vector<4xi32> into vector<8xi32> + // CHECK: %[[ADDI:.*]] = arith.addi %[[INSERT1]], %[[INSERT3]] : vector<8xi32> + // CHECK: %[[REDUCTION:.*]] = vector.reduction , %[[ADDI]] : vector<8xi32> into i32 + // CHECK: %[[RET:.*]] = arith.addi %[[REDUCTION]], %[[ARG4]] : i32 + // CHECK: return %[[RET]] : i32 + %0 = arith.addi %arg0, %arg1 : vector<8xi32> + %1 = vector.reduction , %0 : vector<8xi32> into i32 + %2 = arith.addi %1, %arg2 : i32 + return %2 : i32 +} + +// ----- + +// CHECK-LABEL: func.func private @unsupported_decl(vector<8xi32>) +func.func private @unsupported_decl(vector<8xi32>) + +// ----- + +// CHECK-LABEL: @unsupported_scalable +// CHECK-SAME: (%[[ARG0:.+]]: vector<[8]xi32>) +func.func @unsupported_scalable(%arg0 : vector<[8]xi32>) -> (vector<[8]xi32>) { + // CHECK: return %[[ARG0]] : vector<[8]xi32> + return %arg0 : vector<[8]xi32> +} + diff --git a/mlir/test/Conversion/ConvertToSPIRV/index.mlir b/mlir/test/Conversion/ConvertToSPIRV/index.mlir index db747625bc7b3..e1cb18aac5d01 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/index.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/index.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-to-spirv | FileCheck %s +// RUN: mlir-opt %s -convert-to-spirv="run-signature-conversion=false" | FileCheck %s // CHECK-LABEL: @basic func.func @basic(%a: index, %b: index) { diff --git a/mlir/test/Conversion/ConvertToSPIRV/scf.mlir b/mlir/test/Conversion/ConvertToSPIRV/scf.mlir index f619ca5771824..58ec6ac61f6ac 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/scf.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/scf.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-to-spirv %s | FileCheck %s +// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s // CHECK-LABEL: @if_yield // CHECK: %[[VAR:.*]] = spirv.Variable : !spirv.ptr diff --git a/mlir/test/Conversion/ConvertToSPIRV/simple.mlir b/mlir/test/Conversion/ConvertToSPIRV/simple.mlir index 20b2a42bc3975..c5e0e6603d94a 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/simple.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/simple.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-to-spirv %s | FileCheck %s +// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s // CHECK-LABEL: @return_scalar // CHECK-SAME: %[[ARG0:.*]]: i32 diff --git a/mlir/test/Conversion/ConvertToSPIRV/ub.mlir b/mlir/test/Conversion/ConvertToSPIRV/ub.mlir index 66528b68f58cf..a83bfb6f405a0 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/ub.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/ub.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-to-spirv %s | FileCheck %s +// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s // CHECK-LABEL: @ub // CHECK: %[[UNDEF:.*]] = spirv.Undef : i32 diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir index 336f0fe10c27e..c63dd030f4747 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir +++ 
b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -convert-to-spirv %s | FileCheck %s +// RUN: mlir-opt -split-input-file -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s // CHECK-LABEL: @extract // CHECK-SAME: %[[ARG:.+]]: vector<2xf32> diff --git a/mlir/test/lib/Conversion/CMakeLists.txt b/mlir/test/lib/Conversion/CMakeLists.txt index 754c9866d18e4..19975f671b081 100644 --- a/mlir/test/lib/Conversion/CMakeLists.txt +++ b/mlir/test/lib/Conversion/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(ConvertToSPIRV) add_subdirectory(FuncToLLVM) add_subdirectory(MathToVCIX) add_subdirectory(OneToNTypeConversion) diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt b/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt new file mode 100644 index 0000000000000..69b5787f7e851 --- /dev/null +++ b/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt @@ -0,0 +1,16 @@ +# Exclude tests from libMLIR.so +add_mlir_library(MLIRTestConvertToSPIRV + TestSPIRVFuncSignatureConversion.cpp + + EXCLUDE_FROM_LIBMLIR + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRFuncDialect + MLIRPass + MLIRSPIRVConversion + MLIRSPIRVDialect + MLIRTransformUtils + MLIRTransforms + MLIRVectorDialect + ) diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp new file mode 100644 index 0000000000000..ec67f85f6f27b --- /dev/null +++ b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp @@ -0,0 +1,57 @@ +//===- TestSPIRVFuncSignatureConversion.cpp - Test signature conversion -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +namespace { + +struct TestSPIRVFuncSignatureConversion final + : PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSPIRVFuncSignatureConversion) + + StringRef getArgument() const final { + return "test-spirv-func-signature-conversion"; + } + + StringRef getDescription() const final { + return "Test patterns that convert vector inputs and results in function " + "signatures"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + populateFuncOpVectorRewritePatterns(patterns); + populateReturnOpVectorRewritePatterns(patterns); + GreedyRewriteConfig config; + config.strictMode = GreedyRewriteStrictness::ExistingOps; + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + config); + } +}; + +} // namespace + +namespace test { +void registerTestSPIRVFuncSignatureConversion() { + PassRegistration(); +} +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index e8091bca3326c..8b79de58fa102 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -36,6 +36,7 @@ if(MLIR_INCLUDE_TESTS) MLIRSPIRVTestPasses MLIRTensorTestPasses MLIRTestAnalysis + MLIRTestConvertToSPIRV MLIRTestDialect MLIRTestDynDialect MLIRTestIR diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 8cafb0afac9ae..149f9d59961b8 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -141,6 +141,7 @@ void registerTestSCFWhileOpBuilderPass(); void registerTestSCFWrapInZeroTripCheckPasses(); void registerTestShapeMappingPass(); void registerTestSliceAnalysisPass(); +void registerTestSPIRVFuncSignatureConversion(); void registerTestTensorCopyInsertionPass(); void registerTestTensorTransforms(); void registerTestTopologicalSortAnalysisPass(); @@ -273,6 +274,7 @@ void registerTestPasses() { mlir::test::registerTestSCFWrapInZeroTripCheckPasses(); mlir::test::registerTestShapeMappingPass(); mlir::test::registerTestSliceAnalysisPass(); + mlir::test::registerTestSPIRVFuncSignatureConversion(); mlir::test::registerTestTensorCopyInsertionPass(); mlir::test::registerTestTensorTransforms(); mlir::test::registerTestTopologicalSortAnalysisPass(); diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 8d2b2be67ad79..0f9f688403530 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7207,10 +7207,15 @@ cc_library( hdrs = ["include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"], includes = ["include"], deps = [ + ":ArithDialect", + ":DialectUtils", ":FuncDialect", ":IR", ":SPIRVDialect", + ":Support", ":TransformUtils", + ":VectorDialect", + ":VectorTransforms", "//llvm:Support", ], ) @@ -9588,6 +9593,7 @@ cc_binary( "//mlir/test:TestArmSME", 
"//mlir/test:TestBufferization", "//mlir/test:TestControlFlow", + "//mlir/test:TestConvertToSPIRV", "//mlir/test:TestDLTI", "//mlir/test:TestDialect", "//mlir/test:TestFunc", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 1d59370057d1c..a1d2b20a106e6 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -656,6 +656,21 @@ cc_library( ], ) +cc_library( + name = "TestConvertToSPIRV", + srcs = glob(["lib/Conversion/ConvertToSPIRV/*.cpp"]), + deps = [ + "//mlir:ArithDialect", + "//mlir:FuncDialect", + "//mlir:Pass", + "//mlir:SPIRVConversion", + "//mlir:SPIRVDialect", + "//mlir:TransformUtils", + "//mlir:Transforms", + "//mlir:VectorDialect", + ], +) + cc_library( name = "TestAffine", srcs = glob([ From d748dab6010dfd4ddf63cd59c0a89487824aa038 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Wed, 17 Jul 2024 10:11:16 -0700 Subject: [PATCH 308/777] [bazel] Port #98653 (#99356) --- .../llvm-project-overlay/mlir/BUILD.bazel | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 0f9f688403530..5badfccc29f22 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5992,6 +5992,7 @@ cc_library( ":LLVMCommonConversion", ":LLVMDialect", ":MathDialect", + ":MathToROCDL", ":MemRefDialect", ":MemRefToLLVM", ":Pass", @@ -7284,6 +7285,32 @@ cc_library( ], ) +cc_library( + name = "MathToROCDL", + srcs = glob([ + "lib/Conversion/MathToROCDL/*.cpp", + ]), + hdrs = glob([ + "include/mlir/Conversion/MathToROCDL/*.h", + ]), + includes = ["include"], + deps = [ + ":ConversionPassIncGen", + ":DialectUtils", + ":FuncDialect", + ":GPUCommonTransforms", + ":GPUToGPURuntimeTransforms", + ":IR", + ":LLVMCommonConversion", + ":LLVMDialect", + ":MathDialect", + ":Pass", + ":ROCDLDialect", + ":TransformUtils", + ":VectorDialect", + ], +) + cc_library( name = "FuncToEmitC", srcs = glob([ From 963e25ae60f43ea77b686bd506171ee7482f044a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 17 Jul 2024 19:19:59 +0200 Subject: [PATCH 309/777] [libc++][NFC] Remove a few unused includes (#98808) --- libcxx/include/__type_traits/add_pointer.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/libcxx/include/__type_traits/add_pointer.h b/libcxx/include/__type_traits/add_pointer.h index 358e3cbd23843..5aac7d5cfa90d 100644 --- a/libcxx/include/__type_traits/add_pointer.h +++ b/libcxx/include/__type_traits/add_pointer.h @@ -11,9 +11,7 @@ #include <__config> #include <__type_traits/is_referenceable.h> -#include <__type_traits/is_same.h> #include <__type_traits/is_void.h> -#include <__type_traits/remove_cv.h> #include <__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) From 81955da03bd4731b668fee401b3d6aca8b7d4da6 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 17 Jul 2024 12:20:48 -0500 Subject: [PATCH 310/777] [libc++] Remove special handling of the native C++ library in benchmarks (#98529) There were some ad-hoc settings that allowed running the benchmarks against the native C++ Standard Library. While this ability is very useful, it was done before the test suite was quite independent of libc++ itself. 
Instead, it is better to streamline running the benchmarks on the native standard library by using a custom Lit configuration like we do with the test suite. A follow-up patch will rework the integration of benchmarks with the Lit configuration used for the test suite so that we can reuse the same mechanism for both, making it easy to benchmark the native standard library. It will also make benchmarks way more user-friendly to run since we will be able to run them like we run individual tests, which is a pain point right now. --- libcxx/CMakeLists.txt | 14 ----- libcxx/benchmarks/CMakeLists.txt | 100 ++++++------------------------- libcxx/benchmarks/lit.cfg.py | 2 +- libcxx/docs/BuildingLibcxx.rst | 16 ----- libcxx/docs/ReleaseNotes/19.rst | 4 ++ libcxx/docs/TestingLibcxx.rst | 15 ++--- 6 files changed, 28 insertions(+), 123 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 190a97db9462f..155f81a74a974 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -159,20 +159,6 @@ set(LIBCXX_BENCHMARK_TEST_ARGS_DEFAULT --benchmark_min_time=0.01) set(LIBCXX_BENCHMARK_TEST_ARGS "${LIBCXX_BENCHMARK_TEST_ARGS_DEFAULT}" CACHE STRING "Arguments to pass when running the benchmarks using check-cxx-benchmarks") -set(LIBCXX_BENCHMARK_NATIVE_STDLIB "" CACHE STRING - "Build the benchmarks against the specified native STL. - The value must be one of libc++/libstdc++") -set(LIBCXX_BENCHMARK_NATIVE_GCC_TOOLCHAIN "" CACHE STRING - "Use alternate GCC toolchain when building the native benchmarks") - -if (LIBCXX_BENCHMARK_NATIVE_STDLIB) - if (NOT (LIBCXX_BENCHMARK_NATIVE_STDLIB STREQUAL "libc++" - OR LIBCXX_BENCHMARK_NATIVE_STDLIB STREQUAL "libstdc++")) - message(FATAL_ERROR "Invalid value for LIBCXX_BENCHMARK_NATIVE_STDLIB: " - "'${LIBCXX_BENCHMARK_NATIVE_STDLIB}'") - endif() -endif() - option(LIBCXX_INCLUDE_DOCS "Build the libc++ documentation." 
${LLVM_INCLUDE_DOCS}) set(LIBCXX_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING "Define suffix of library directory name (32/64)") diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index 2101f9c71788c..110672600213a 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -2,12 +2,12 @@ include(ExternalProject) include(CheckCXXCompilerFlag) #============================================================================== -# Build Google Benchmark for libc++ +# Build Google Benchmark #============================================================================== set(CMAKE_FOLDER "${CMAKE_FOLDER}/Benchmarks") -set(BENCHMARK_LIBCXX_COMPILE_FLAGS +set(BENCHMARK_COMPILE_FLAGS -Wno-unused-command-line-argument -nostdinc++ -isystem "${LIBCXX_GENERATED_INCLUDE_DIR}" @@ -16,64 +16,37 @@ set(BENCHMARK_LIBCXX_COMPILE_FLAGS ${SANITIZER_FLAGS} ) if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE) - list(APPEND BENCHMARK_LIBCXX_COMPILE_FLAGS + list(APPEND BENCHMARK_COMPILE_FLAGS -isystem "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}") endif() if (DEFINED LIBCXX_CXX_ABI_LIBRARY_PATH) - list(APPEND BENCHMARK_LIBCXX_COMPILE_FLAGS + list(APPEND BENCHMARK_COMPILE_FLAGS -L${LIBCXX_CXX_ABI_LIBRARY_PATH} -Wl,-rpath,${LIBCXX_CXX_ABI_LIBRARY_PATH}) endif() -split_list(BENCHMARK_LIBCXX_COMPILE_FLAGS) +split_list(BENCHMARK_COMPILE_FLAGS) -ExternalProject_Add(google-benchmark-libcxx +ExternalProject_Add(google-benchmark EXCLUDE_FROM_ALL ON DEPENDS cxx cxx-headers - PREFIX benchmark-libcxx + PREFIX google-benchmark SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark - INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/benchmark-libcxx + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark CMAKE_CACHE_ARGS -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} -DCMAKE_BUILD_TYPE:STRING=RELEASE -DCMAKE_INSTALL_PREFIX:PATH= - -DCMAKE_CXX_FLAGS:STRING=${BENCHMARK_LIBCXX_COMPILE_FLAGS} + -DCMAKE_CXX_FLAGS:STRING=${BENCHMARK_COMPILE_FLAGS} -DBENCHMARK_USE_LIBCXX:BOOL=ON -DBENCHMARK_ENABLE_TESTING:BOOL=OFF) -#============================================================================== -# Build Google Benchmark for the native stdlib -#============================================================================== -set(BENCHMARK_NATIVE_TARGET_FLAGS) -if (LIBCXX_BENCHMARK_NATIVE_GCC_TOOLCHAIN) - set(BENCHMARK_NATIVE_TARGET_FLAGS - --gcc-toolchain=${LIBCXX_BENCHMARK_NATIVE_GCC_TOOLCHAIN}) -endif() -split_list(BENCHMARK_NATIVE_TARGET_FLAGS) - -if (LIBCXX_BENCHMARK_NATIVE_STDLIB) - ExternalProject_Add(google-benchmark-native - EXCLUDE_FROM_ALL ON - PREFIX benchmark-native - SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark - INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/benchmark-native - CMAKE_CACHE_ARGS - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS:STRING=${BENCHMARK_NATIVE_TARGET_FLAGS} - -DCMAKE_BUILD_TYPE:STRING=RELEASE - -DCMAKE_INSTALL_PREFIX:PATH= - -DBENCHMARK_ENABLE_TESTING:BOOL=OFF) -endif() - - #============================================================================== # Benchmark tests configuration #============================================================================== add_custom_target(cxx-benchmarks) set(BENCHMARK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) -set(BENCHMARK_LIBCXX_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/benchmark-libcxx) -set(BENCHMARK_NATIVE_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/benchmark-native) +set(BENCHMARK_INSTALL_DIR 
${CMAKE_CURRENT_BINARY_DIR}/google-benchmark) add_library( cxx-benchmarks-flags INTERFACE) @@ -97,32 +70,14 @@ else() target_compile_features( cxx-benchmarks-flags INTERFACE cxx_std_23) endif() -target_compile_options( cxx-benchmarks-flags INTERFACE -fsized-deallocation -nostdinc++) +target_compile_options(cxx-benchmarks-flags INTERFACE -fsized-deallocation -nostdinc++ + ${SANITIZER_FLAGS} -Wno-user-defined-literals -Wno-suggest-override) target_include_directories(cxx-benchmarks-flags INTERFACE "${LIBCXX_GENERATED_INCLUDE_DIR}" - INTERFACE "${BENCHMARK_LIBCXX_INSTALL}/include" + INTERFACE "${BENCHMARK_INSTALL_DIR}/include" INTERFACE "${LIBCXX_SOURCE_DIR}/test/support") - -add_library( cxx-benchmarks-flags-native INTERFACE) -target_link_libraries( cxx-benchmarks-flags-native INTERFACE cxx-benchmarks-flags) -target_compile_options(cxx-benchmarks-flags-native INTERFACE ${BENCHMARK_NATIVE_TARGET_FLAGS}) -target_link_options( cxx-benchmarks-flags-native INTERFACE ${BENCHMARK_NATIVE_TARGET_FLAGS} "-L${BENCHMARK_NATIVE_INSTALL}/lib") -if (LIBCXX_BENCHMARK_NATIVE_STDLIB STREQUAL "libstdc++") - find_library(LIBSTDCXX_FILESYSTEM_TEST stdc++fs - PATHS ${LIBCXX_BENCHMARK_NATIVE_GCC_TOOLCHAIN} - PATH_SUFFIXES lib lib64 - DOC "The libstdc++ filesystem library used by the benchmarks" - ) - if (LIBSTDCXX_FILESYSTEM_TEST) - target_link_libraries(cxx-benchmarks-flags-native INTERFACE -lstdc++fs) - endif() -else() - target_link_libraries(cxx-benchmarks-flags-native INTERFACE -lc++fs -lc++experimental) -endif() - -add_library( cxx-benchmarks-flags-libcxx INTERFACE) -target_link_libraries( cxx-benchmarks-flags-libcxx INTERFACE cxx-benchmarks-flags) -target_compile_options(cxx-benchmarks-flags-libcxx INTERFACE ${SANITIZER_FLAGS} -Wno-user-defined-literals -Wno-suggest-override) -target_link_options( cxx-benchmarks-flags-libcxx INTERFACE -lm -nostdlib++ "-L${BENCHMARK_LIBCXX_INSTALL}/lib" "-L${BENCHMARK_LIBCXX_INSTALL}/lib64" ${SANITIZER_FLAGS}) +target_link_options(cxx-benchmarks-flags INTERFACE -lm -nostdlib++ + "-L${BENCHMARK_INSTALL_DIR}/lib" "-L${BENCHMARK_INSTALL_DIR}/lib64" + ${SANITIZER_FLAGS}) set(libcxx_benchmark_targets) @@ -130,8 +85,8 @@ function(add_benchmark_test name source_file) set(libcxx_target ${name}_libcxx) list(APPEND libcxx_benchmark_targets ${libcxx_target}) add_executable(${libcxx_target} EXCLUDE_FROM_ALL ${source_file}) - target_link_libraries(${libcxx_target} PRIVATE cxx-benchmarks-flags-libcxx) - add_dependencies(${libcxx_target} cxx google-benchmark-libcxx) + target_link_libraries(${libcxx_target} PRIVATE cxx-benchmarks-flags) + add_dependencies(${libcxx_target} cxx google-benchmark) add_dependencies(cxx-benchmarks ${libcxx_target}) if (LIBCXX_ENABLE_SHARED) target_link_libraries(${libcxx_target} PRIVATE cxx_shared) @@ -144,27 +99,10 @@ function(add_benchmark_test name source_file) endif() set_target_properties(${libcxx_target} PROPERTIES - OUTPUT_NAME "${name}.libcxx.out" + OUTPUT_NAME "${name}.bench.out" RUNTIME_OUTPUT_DIRECTORY "${BENCHMARK_OUTPUT_DIR}" CXX_EXTENSIONS NO) cxx_link_system_libraries(${libcxx_target}) - if (LIBCXX_BENCHMARK_NATIVE_STDLIB) - set(native_target ${name}_native) - add_executable(${native_target} EXCLUDE_FROM_ALL ${source_file}) - target_link_libraries(${native_target} PRIVATE cxx-benchmarks-flags-native) - add_dependencies(${native_target} google-benchmark-native - google-benchmark-libcxx) - target_link_libraries(${native_target} PRIVATE -lbenchmark) - if (LIBCXX_HAS_PTHREAD_LIB) - target_link_libraries(${native_target} PRIVATE -pthread) - endif() - 
add_dependencies(cxx-benchmarks ${native_target}) - set_target_properties(${native_target} - PROPERTIES - OUTPUT_NAME "${name}.native.out" - RUNTIME_OUTPUT_DIRECTORY "${BENCHMARK_OUTPUT_DIR}" - CXX_EXTENSIONS NO) - endif() endfunction() diff --git a/libcxx/benchmarks/lit.cfg.py b/libcxx/benchmarks/lit.cfg.py index 7d222ddf9284e..0d08966c26cc1 100644 --- a/libcxx/benchmarks/lit.cfg.py +++ b/libcxx/benchmarks/lit.cfg.py @@ -19,5 +19,5 @@ config.test_source_root = config.test_exec_root config.test_format = GoogleBenchmark( - test_sub_dirs=".", test_suffix=".libcxx.out", benchmark_args=config.benchmark_args + test_sub_dirs=".", test_suffix=".bench.out", benchmark_args=config.benchmark_args ) diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst index e425b9dadfe7d..66bb19bb5b2cd 100644 --- a/libcxx/docs/BuildingLibcxx.rst +++ b/libcxx/docs/BuildingLibcxx.rst @@ -399,22 +399,6 @@ libc++ Feature Options since the primary use of ``check-cxx-benchmarks`` is to get test and sanitizer coverage, not to get accurate measurements. -.. option:: LIBCXX_BENCHMARK_NATIVE_STDLIB:STRING - - **Default**:: ``""`` - - **Values**:: ``libc++``, ``libstdc++`` - - Build the libc++ benchmark tests and Google Benchmark library against the - specified standard library on the platform. On Linux this can be used to - compare libc++ to libstdc++ by building the benchmark tests against both - standard libraries. - -.. option:: LIBCXX_BENCHMARK_NATIVE_GCC_TOOLCHAIN:STRING - - Use the specified GCC toolchain and standard library when building the native - stdlib benchmark tests. - .. option:: LIBCXX_ASSERTION_HANDLER_FILE:PATH **Default**:: ``"${CMAKE_CURRENT_SOURCE_DIR}/vendor/llvm/default_assertion_handler.in"`` diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 05aeaba7f8716..80b9e18cec901 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -178,3 +178,7 @@ Build System Changes to automatically detect the presence of ``clang-tidy`` and the required ``Clang`` libraries. - The CMake options ``LIBCXX_INSTALL_MODULES`` now defaults to ``ON``. + +- The CMake options ``LIBCXX_BENCHMARK_NATIVE_STDLIB`` and ``LIBCXX_BENCHMARK_NATIVE_GCC_TOOLCHAIN`` have + been removed. To benchmark the native standard library, configure the test suite against the native + standard library directly instead. diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index d9f4fe467fe36..6d3417cabfd61 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -351,7 +351,7 @@ Test Filenames`_ when determining the names for new test files. - Same as ``FOO.pass.cpp``, but for Objective-C++. * - ``FOO.compile.pass.cpp`` - - Checks whether the C++ code in the file compiles successfully. In general, prefer ``compile`` tests over ``verify`` tests, + - Checks whether the C++ code in the file compiles successfully. In general, prefer ``compile`` tests over ``verify`` tests, subject to the specific recommendations, below, for when to write ``verify`` tests. * - ``FOO.compile.pass.mm`` - Same as ``FOO.compile.pass.cpp``, but for Objective-C++. @@ -447,19 +447,12 @@ An example build would look like: .. code-block:: bash - $ cd build - $ ninja cxx-benchmarks + $ ninja -C build cxx-benchmarks This will build all of the benchmarks under ``/benchmarks`` to be built against the just-built libc++. The compiled tests are output into ``build/projects/libcxx/benchmarks``. 
-The benchmarks can also be built against the platforms native standard library
-using the ``-DLIBCXX_BUILD_BENCHMARKS_NATIVE_STDLIB=ON`` CMake option. This
-is useful for comparing the performance of libc++ to other standard libraries.
-The compiled benchmarks are named ``.libcxx.out`` if they test libc++ and
-``.native.out`` otherwise.
-
 Also See:
 
 * :ref:`Building Libc++ `
@@ -476,8 +469,8 @@ For example:
 
 .. code-block:: bash
 
     $ cd build/projects/libcxx/benchmarks
-    $ ./algorithms.libcxx.out # Runs all the benchmarks
-    $ ./algorithms.libcxx.out --benchmark_filter=BM_Sort.* # Only runs the sort benchmarks
+    $ ./algorithms.bench.out # Runs all the benchmarks
+    $ ./algorithms.bench.out --benchmark_filter=BM_Sort.* # Only runs the sort benchmarks
 
 For more information about running benchmarks see `Google Benchmark`_.
 

From 18cdfa72e046a40deeee4372ee98602fd1a65a94 Mon Sep 17 00:00:00 2001
From: Lei Wang
Date: Wed, 17 Jul 2024 10:33:00 -0700
Subject: [PATCH 311/777] [SampleFDO] Stale profile call-graph matching
 (#95135)

Profile staleness can be caused by function renaming. Because the sample
profile loader relies on exact string matching, a trivial change in a
function signature (such as `int foo()` --> `long foo()`) changes the
mangled name, and the function profile (including all nested children
profiles) becomes unavailable.

This patch introduces stale-profile matching at the call-graph level,
aiming to identify such trivial renamings and reuse the old function
profile.

Some noteworthy details:

1. Extend the LCS-based CFG-level matching to identify new functions.
- Instead of requiring exact name equality, allow a function and a
  profile with different names to match. This leverages the LCS: during
  callsite anchor matching, when the two names differ, try matching the
  functions instead of returning immediately.
- In the LCS, the equality check is replaced by `functionMatchesProfile`.
- Only try matching functions that are new (appearing on neither side).
  This reduces the matching scope, since originally matched functions do
  not need to be matched again.
2. Determine a match by a call-site anchor similarity check.
- A new function `functionMatchesProfile(IRFunc, ProfFunc)` checks a
  candidate renaming pair: it uses the LCS (diff) matching to compute the
  equal set and defines `Similarity = |equalSet * 2| / (|A| + |B|)`. The
  profile is marked as renamed if the similarity is above a threshold
  (`-func-profile-similarity-threshold`).
3. Process the matching in top-down function order.
- When a caller is done matching, the new function names are saved for
  later use; the top-down order maximizes reuse of those results.
- `ProfileNameToFuncMap` is used to save and cache the matching results.
4. Update the original profile at the end using `ProfileNameToFuncMap`.
5. Add a new switch --salvage-unused-profile to control this; the default
   is false.

Verified on one of Meta's internal large services: 90%+ of the discovered
renaming pairs were confirmed to be good.
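To make the similarity check in point 2 concrete, the following is a minimal,
self-contained sketch of the idea. It is an illustration only, not the code
added by this patch: the helper names (`lcsLength`, `calleeSimilarityMatches`)
are invented for the example, and the real implementation reuses the
Myers-diff-based `longestCommonSequence` together with the
`-func-profile-similarity-threshold` option (default 80).

  // Illustrative sketch only; not the actual SampleProfileMatcher code.
  #include <algorithm>
  #include <cstddef>
  #include <string>
  #include <vector>

  // Length of the longest common subsequence of two callee-name sequences
  // (plain O(n*m) dynamic programming for clarity).
  static std::size_t lcsLength(const std::vector<std::string> &A,
                               const std::vector<std::string> &B) {
    std::vector<std::vector<std::size_t>> DP(
        A.size() + 1, std::vector<std::size_t>(B.size() + 1, 0));
    for (std::size_t I = 1; I <= A.size(); ++I)
      for (std::size_t J = 1; J <= B.size(); ++J)
        DP[I][J] = (A[I - 1] == B[J - 1])
                       ? DP[I - 1][J - 1] + 1
                       : std::max(DP[I - 1][J], DP[I][J - 1]);
    return DP[A.size()][B.size()];
  }

  // Similarity = |equal set| * 2 / (|A| + |B|); the pair is treated as a
  // rename only if the similarity exceeds the threshold percentage.
  static bool calleeSimilarityMatches(const std::vector<std::string> &IRAnchors,
                                      const std::vector<std::string> &ProfAnchors,
                                      double ThresholdPercent = 80.0) {
    if (IRAnchors.empty() && ProfAnchors.empty())
      return false;
    double Similarity = 2.0 * lcsLength(IRAnchors, ProfAnchors) /
                        (IRAnchors.size() + ProfAnchors.size());
    return Similarity * 100.0 > ThresholdPercent;
  }

For example, if the IR function's callee anchors are {bar, baz, qux} and the
unused profile's anchors are {bar, baz, log}, the LCS is {bar, baz}, so
Similarity = 2*2/(3+3) ~= 0.67, which is below the default 80% threshold and
the pair is not treated as a rename.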
(There could be incorrect renaming pair if the num of the anchor is small, but checked that those functions are simple cold function) --- llvm/include/llvm/ProfileData/SampleProf.h | 23 +- .../Transforms/IPO/SampleProfileMatcher.h | 115 +++++- .../Utils/SampleProfileLoaderBaseImpl.h | 17 + llvm/lib/ProfileData/SampleProf.cpp | 36 +- llvm/lib/Transforms/IPO/SampleProfile.cpp | 63 +-- .../Transforms/IPO/SampleProfileMatcher.cpp | 368 +++++++++++++++--- ...-pm-thinlto-postlink-samplepgo-defaults.ll | 2 +- ...w-pm-thinlto-prelink-samplepgo-defaults.ll | 2 +- ...robe-stale-profile-renaming-recursive.prof | 11 + .../pseudo-probe-stale-profile-renaming.prof | 57 +++ .../non-probe-stale-profile-matching.ll | 12 +- ...pseudo-probe-stale-profile-matching-LCS.ll | 22 +- ...-probe-stale-profile-renaming-recursive.ll | 150 +++++++ .../pseudo-probe-stale-profile-renaming.ll | 313 +++++++++++++++ 14 files changed, 1064 insertions(+), 127 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming-recursive.prof create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming.prof create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming-recursive.ll create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming.ll diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 5c2a78c14efd0..e7b154dff0697 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -919,12 +919,14 @@ class FunctionSamples { /// Returns a pointer to FunctionSamples at the given callsite location /// \p Loc with callee \p CalleeName. If no callsite can be found, relax /// the restriction to return the FunctionSamples at callsite location - /// \p Loc with the maximum total sample count. If \p Remapper is not - /// nullptr, use \p Remapper to find FunctionSamples with equivalent name - /// as \p CalleeName. - const FunctionSamples * - findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName, - SampleProfileReaderItaniumRemapper *Remapper) const; + /// \p Loc with the maximum total sample count. If \p Remapper or \p + /// FuncNameToProfNameMap is not nullptr, use them to find FunctionSamples + /// with equivalent name as \p CalleeName. + const FunctionSamples *findFunctionSamplesAt( + const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper, + const HashKeyMap + *FuncNameToProfNameMap = nullptr) const; bool empty() const { return TotalSamples == 0; } @@ -1172,11 +1174,14 @@ class FunctionSamples { /// tree nodes in the profile. /// /// \returns the FunctionSamples pointer to the inlined instance. - /// If \p Remapper is not nullptr, it will be used to find matching - /// FunctionSamples with not exactly the same but equivalent name. + /// If \p Remapper or \p FuncNameToProfNameMap is not nullptr, it will be used + /// to find matching FunctionSamples with not exactly the same but equivalent + /// name. 
const FunctionSamples *findFunctionSamples( const DILocation *DIL, - SampleProfileReaderItaniumRemapper *Remapper = nullptr) const; + SampleProfileReaderItaniumRemapper *Remapper = nullptr, + const HashKeyMap + *FuncNameToProfNameMap = nullptr) const; static bool ProfileIsProbeBased; diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h index b6feca5d47035..a67f158433391 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -26,6 +26,7 @@ using AnchorMap = std::map; class SampleProfileMatcher { Module &M; SampleProfileReader &Reader; + LazyCallGraph &CG; const PseudoProbeManager *ProbeManager; const ThinOrFullLTOPhase LTOPhase; SampleProfileMap FlattenedProfiles; @@ -58,6 +59,40 @@ class SampleProfileMatcher { StringMap> FuncCallsiteMatchStates; + struct FuncToProfileNameMapHash { + uint64_t + operator()(const std::pair &P) const { + return hash_combine(P.first, P.second); + } + }; + // A map from a pair of function and profile name to a boolean value + // indicating whether they are matched. This is used as a cache for the + // matching result. + std::unordered_map, bool, + FuncToProfileNameMapHash> + FuncProfileMatchCache; + // The new functions found by the call graph matching. The map's key is the + // the new(renamed) function pointer and the value is old(unused) profile + // name. + std::unordered_map FuncToProfileNameMap; + + // A map pointer to the FuncNameToProfNameMap in SampleProfileLoader, + // which maps the function name to the matched profile name. This is used + // for sample loader to look up profile using the new name. + HashKeyMap *FuncNameToProfNameMap; + + // A map pointer to the SymbolMap in SampleProfileLoader, which stores all + // the original matched symbols before the matching. this is to determine if + // the profile is unused(to be matched) or not. + HashKeyMap *SymbolMap; + + // The new functions from IR. + HashKeyMap + FunctionsWithoutProfile; + + // Pointer to the Profile Symbol List in the reader. + std::shared_ptr PSL; + // Profile mismatch statstics: uint64_t TotalProfiledFunc = 0; // Num of checksum-mismatched function. @@ -72,34 +107,61 @@ class SampleProfileMatcher { uint64_t MismatchedCallsiteSamples = 0; uint64_t RecoveredCallsiteSamples = 0; + // Profile call-graph matching statstics: + uint64_t NumCallGraphRecoveredProfiledFunc = 0; + uint64_t NumCallGraphRecoveredFuncSamples = 0; + // A dummy name for unknown indirect callee, used to differentiate from a // non-call instruction that also has an empty callee name. static constexpr const char *UnknownIndirectCallee = "unknown.indirect.callee"; public: - SampleProfileMatcher(Module &M, SampleProfileReader &Reader, - const PseudoProbeManager *ProbeManager, - ThinOrFullLTOPhase LTOPhase) - : M(M), Reader(Reader), ProbeManager(ProbeManager), LTOPhase(LTOPhase){}; + SampleProfileMatcher( + Module &M, SampleProfileReader &Reader, LazyCallGraph &CG, + const PseudoProbeManager *ProbeManager, ThinOrFullLTOPhase LTOPhase, + HashKeyMap &SymMap, + std::shared_ptr PSL, + HashKeyMap + &FuncNameToProfNameMap) + : M(M), Reader(Reader), CG(CG), ProbeManager(ProbeManager), + LTOPhase(LTOPhase), FuncNameToProfNameMap(&FuncNameToProfNameMap), + SymbolMap(&SymMap), PSL(PSL) {}; void runOnModule(); void clearMatchingData() { // Do not clear FuncMappings, it stores IRLoc to ProfLoc remappings which // will be used for sample loader. 
- FuncCallsiteMatchStates.clear(); + // Do not clear FlattenedProfiles as it contains function names referenced + // by FuncNameToProfNameMap. Clearing this memory could lead to a + // use-after-free error. + freeContainer(FuncCallsiteMatchStates); + freeContainer(FunctionsWithoutProfile); + freeContainer(FuncToProfileNameMap); } private: - FunctionSamples *getFlattenedSamplesFor(const Function &F) { - StringRef CanonFName = FunctionSamples::getCanonicalFnName(F); - auto It = FlattenedProfiles.find(FunctionId(CanonFName)); + FunctionSamples *getFlattenedSamplesFor(const FunctionId &Fname) { + auto It = FlattenedProfiles.find(Fname); if (It != FlattenedProfiles.end()) return &It->second; return nullptr; } + FunctionSamples *getFlattenedSamplesFor(const Function &F) { + StringRef CanonFName = FunctionSamples::getCanonicalFnName(F); + return getFlattenedSamplesFor(FunctionId(CanonFName)); + } + template inline void freeContainer(T &C) { + T Empty; + std::swap(C, Empty); + } + void getFilteredAnchorList(const AnchorMap &IRAnchors, + const AnchorMap &ProfileAnchors, + AnchorList &FilteredIRAnchorsList, + AnchorList &FilteredProfileAnchorList); void runOnFunction(Function &F); - void findIRAnchors(const Function &F, AnchorMap &IRAnchors); - void findProfileAnchors(const FunctionSamples &FS, AnchorMap &ProfileAnchors); + void findIRAnchors(const Function &F, AnchorMap &IRAnchors) const; + void findProfileAnchors(const FunctionSamples &FS, + AnchorMap &ProfileAnchors) const; // Record the callsite match states for profile staleness report, the result // is saved in FuncCallsiteMatchStates. void recordCallsiteMatchStates(const Function &F, const AnchorMap &IRAnchors, @@ -124,6 +186,9 @@ class SampleProfileMatcher { State == MatchState::RemovedMatch; }; + void countCallGraphRecoveredSamples( + const FunctionSamples &FS, + std::unordered_set &MatchedUnusedProfile); // Count the samples of checksum mismatched function for the top-level // function and all inlinees. void countMismatchedFuncSamples(const FunctionSamples &FS, bool IsTopLevel); @@ -151,15 +216,37 @@ class SampleProfileMatcher { // parts from the resulting SES are used to remap the IR locations to the // profile locations. As the number of function callsite is usually not big, // we currently just implements the basic greedy version(page 6 of the paper). - LocToLocMap - longestCommonSequence(const AnchorList &IRCallsiteAnchors, - const AnchorList &ProfileCallsiteAnchors) const; + LocToLocMap longestCommonSequence(const AnchorList &IRCallsiteAnchors, + const AnchorList &ProfileCallsiteAnchors, + bool MatchUnusedFunction); void matchNonCallsiteLocs(const LocToLocMap &AnchorMatchings, const AnchorMap &IRAnchors, LocToLocMap &IRToProfileLocationMap); void runStaleProfileMatching(const Function &F, const AnchorMap &IRAnchors, const AnchorMap &ProfileAnchors, - LocToLocMap &IRToProfileLocationMap); + LocToLocMap &IRToProfileLocationMap, + bool RunCFGMatching, bool RunCGMatching); + // If the function doesn't have profile, return the pointer to the function. + bool functionHasProfile(const FunctionId &IRFuncName, + Function *&FuncWithoutProfile); + bool isProfileUnused(const FunctionId &ProfileFuncName); + bool functionMatchesProfileHelper(const Function &IRFunc, + const FunctionId &ProfFunc); + // Determine if the function matches profile. If FindMatchedProfileOnly is + // set, only search the existing matched function. Otherwise, try matching the + // two functions. 
+ bool functionMatchesProfile(const FunctionId &IRFuncName, + const FunctionId &ProfileFuncName, + bool FindMatchedProfileOnly); + // Determine if the function matches profile by computing a similarity ratio + // between two sequences of callsite anchors extracted from function and + // profile. If it's above the threshold, the function matches the profile. + bool functionMatchesProfile(Function &IRFunc, const FunctionId &ProfFunc, + bool FindMatchedProfileOnly); + // Find functions that don't show in the profile or profile symbol list, + // which are supposed to be new functions. We use them as the targets for + // call graph matching. + void findFunctionsWithoutProfile(); void reportOrPersistProfileStats(); }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h index 7c725a3c1216c..32bf7b8c96be3 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PostDominators.h" @@ -155,6 +156,22 @@ static inline bool skipProfileForFunction(const Function &F) { return F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"); } +static inline void +buildTopDownFuncOrder(LazyCallGraph &CG, + std::vector &FunctionOrderList) { + CG.buildRefSCCs(); + for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) { + for (LazyCallGraph::SCC &C : RC) { + for (LazyCallGraph::Node &N : C) { + Function &F = N.getFunction(); + if (!skipProfileForFunction(F)) + FunctionOrderList.push_back(&F); + } + } + } + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); +} + template class SampleProfileLoaderBaseImpl { public: SampleProfileLoaderBaseImpl(std::string Name, std::string RemapName, diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index 294f64636d989..addb473faebdf 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -236,7 +236,9 @@ LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL, } const FunctionSamples *FunctionSamples::findFunctionSamples( - const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper) const { + const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper, + const HashKeyMap + *FuncNameToProfNameMap) const { assert(DIL); SmallVector, 10> S; @@ -256,7 +258,8 @@ const FunctionSamples *FunctionSamples::findFunctionSamples( return this; const FunctionSamples *FS = this; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i].first, S[i].second, Remapper); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second, Remapper, + FuncNameToProfNameMap); } return FS; } @@ -277,19 +280,32 @@ void FunctionSamples::findAllNames(DenseSet &NameSet) const { const FunctionSamples *FunctionSamples::findFunctionSamplesAt( const LineLocation &Loc, StringRef CalleeName, - SampleProfileReaderItaniumRemapper *Remapper) const { + SampleProfileReaderItaniumRemapper *Remapper, + const HashKeyMap + *FuncNameToProfNameMap) const { CalleeName = getCanonicalFnName(CalleeName); - auto iter = CallsiteSamples.find(mapIRLocToProfileLoc(Loc)); - if (iter == CallsiteSamples.end()) + 
auto I = CallsiteSamples.find(mapIRLocToProfileLoc(Loc)); + if (I == CallsiteSamples.end()) return nullptr; - auto FS = iter->second.find(getRepInFormat(CalleeName)); - if (FS != iter->second.end()) + auto FS = I->second.find(getRepInFormat(CalleeName)); + if (FS != I->second.end()) return &FS->second; + + if (FuncNameToProfNameMap && !FuncNameToProfNameMap->empty()) { + auto R = FuncNameToProfNameMap->find(FunctionId(CalleeName)); + if (R != FuncNameToProfNameMap->end()) { + CalleeName = R->second.stringRef(); + auto FS = I->second.find(getRepInFormat(CalleeName)); + if (FS != I->second.end()) + return &FS->second; + } + } + if (Remapper) { if (auto NameInProfile = Remapper->lookUpNameInProfile(CalleeName)) { - auto FS = iter->second.find(getRepInFormat(*NameInProfile)); - if (FS != iter->second.end()) + auto FS = I->second.find(getRepInFormat(*NameInProfile)); + if (FS != I->second.end()) return &FS->second; } } @@ -300,7 +316,7 @@ const FunctionSamples *FunctionSamples::findFunctionSamplesAt( return nullptr; uint64_t MaxTotalSamples = 0; const FunctionSamples *R = nullptr; - for (const auto &NameFS : iter->second) + for (const auto &NameFS : I->second) if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { MaxTotalSamples = NameFS.second.getTotalSamples(); R = &NameFS.second; diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 13c0e0d0abff0..5cc2911a1a80e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -134,6 +134,10 @@ cl::opt SalvageStaleProfile( "salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query.")); +cl::opt + SalvageUnusedProfile("salvage-unused-profile", cl::Hidden, cl::init(false), + cl::desc("Salvage unused profile by matching with new " + "functions on call graph.")); cl::opt ReportProfileStaleness( "report-profile-staleness", cl::Hidden, cl::init(false), @@ -462,12 +466,13 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { IntrusiveRefCntPtr FS, std::function GetAssumptionCache, std::function GetTargetTransformInfo, - std::function GetTLI) + std::function GetTLI, + LazyCallGraph &CG) : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), std::move(FS)), GetAC(std::move(GetAssumptionCache)), GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), - LTOPhase(LTOPhase), + CG(CG), LTOPhase(LTOPhase), AnnotatedPassName(AnnotateSampleProfileInlinePhase ? llvm::AnnotateInlinePassName(InlineContext{ LTOPhase, InlinePass::SampleProfileInliner}) @@ -475,7 +480,7 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); bool runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI, LazyCallGraph &CG); + ProfileSummaryInfo *_PSI); protected: bool runOnFunction(Function &F, ModuleAnalysisManager *AM); @@ -527,9 +532,14 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { /// is one-to-one mapping. HashKeyMap SymbolMap; + /// Map from function name to profile name generated by call-graph based + /// profile fuzzy matching(--salvage-unused-profile). + HashKeyMap FuncNameToProfNameMap; + std::function GetAC; std::function GetTTI; std::function GetTLI; + LazyCallGraph &CG; /// Profile tracker for different context. 
std::unique_ptr ContextTracker; @@ -544,7 +554,7 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { /// Profle Symbol list tells whether a function name appears in the binary /// used to generate the current profile. - std::unique_ptr PSL; + std::shared_ptr PSL; /// Total number of samples collected in this profile. /// @@ -696,7 +706,8 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { return nullptr; return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL), - CalleeName, Reader->getRemapper()); + CalleeName, Reader->getRemapper(), + &FuncNameToProfNameMap); } /// Returns a vector of FunctionSamples that are the indirect call targets @@ -774,8 +785,8 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { if (FunctionSamples::ProfileIsCS) it.first->second = ContextTracker->getContextSamplesFor(DIL); else - it.first->second = - Samples->findFunctionSamples(DIL, Reader->getRemapper()); + it.first->second = Samples->findFunctionSamples( + DIL, Reader->getRemapper(), &FuncNameToProfNameMap); } return it.first->second; } @@ -1923,20 +1934,9 @@ SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) { } ++CGI; } - } else { - CG.buildRefSCCs(); - for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) { - for (LazyCallGraph::SCC &C : RC) { - for (LazyCallGraph::Node &N : C) { - Function &F = N.getFunction(); - if (!skipProfileForFunction(F)) - FunctionOrderList.push_back(&F); - } - } - } - } - - std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); + } else + buildTopDownFuncOrder(CG, FunctionOrderList); LLVM_DEBUG({ dbgs() << "Function processing order:\n"; @@ -2066,7 +2066,8 @@ bool SampleProfileLoader::doInitialization(Module &M, if (ReportProfileStaleness || PersistProfileStaleness || SalvageStaleProfile) { MatchingManager = std::make_unique( - M, *Reader, ProbeManager.get(), LTOPhase); + M, *Reader, CG, ProbeManager.get(), LTOPhase, SymbolMap, PSL, + FuncNameToProfNameMap); } return true; @@ -2136,8 +2137,7 @@ void SampleProfileLoader::removePseudoProbeInsts(Module &M) { } bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI, - LazyCallGraph &CG) { + ProfileSummaryInfo *_PSI) { GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); PSI = _PSI; @@ -2182,14 +2182,18 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } } } - assert(SymbolMap.count(FunctionId()) == 0 && - "No empty StringRef should be added in SymbolMap"); + // Stale profile matching. if (ReportProfileStaleness || PersistProfileStaleness || SalvageStaleProfile) { MatchingManager->runOnModule(); MatchingManager->clearMatchingData(); } + assert(SymbolMap.count(FunctionId()) == 0 && + "No empty StringRef should be added in SymbolMap"); + assert((SalvageUnusedProfile || FuncNameToProfNameMap.empty()) && + "FuncNameToProfNameMap is not empty when --salvage-unused-profile is " + "not enabled"); bool retval = false; for (auto *F : buildFunctionOrder(M, CG)) { @@ -2319,19 +2323,18 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M, if (!FS) FS = vfs::getRealFileSystem(); + LazyCallGraph &CG = AM.getResult(M); SampleProfileLoader SampleLoader( ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, ProfileRemappingFileName.empty() ? 
SampleProfileRemappingFile : ProfileRemappingFileName, - LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI); - + LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG); if (!SampleLoader.doInitialization(M, &FAM)) return PreservedAnalyses::all(); ProfileSummaryInfo *PSI = &AM.getResult(M); - LazyCallGraph &CG = AM.getResult(M); - if (!SampleLoader.runOnModule(M, &AM, PSI, CG)) + if (!SampleLoader.runOnModule(M, &AM, PSI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 11368e3375bdd..312672e56b017 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -21,7 +21,23 @@ using namespace sampleprof; #define DEBUG_TYPE "sample-profile-matcher" +static cl::opt FuncProfileSimilarityThreshold( + "func-profile-similarity-threshold", cl::Hidden, cl::init(80), + cl::desc("Consider a profile matches a function if the similarity of their " + "callee sequences is above the specified percentile.")); + +static cl::opt MinFuncCountForCGMatching( + "min-func-count-for-cg-matching", cl::Hidden, cl::init(5), + cl::desc("The minimum number of basic blocks required for a function to " + "run stale profile call graph matching.")); + +static cl::opt MinCallCountForCGMatching( + "min-call-count-for-cg-matching", cl::Hidden, cl::init(3), + cl::desc("The minimum number of call anchors required for a function to " + "run stale profile call graph matching.")); + extern cl::opt SalvageStaleProfile; +extern cl::opt SalvageUnusedProfile; extern cl::opt PersistProfileStaleness; extern cl::opt ReportProfileStaleness; @@ -31,7 +47,7 @@ static cl::opt SalvageStaleProfileMaxCallsites( "profile matching will be skipped.")); void SampleProfileMatcher::findIRAnchors(const Function &F, - AnchorMap &IRAnchors) { + AnchorMap &IRAnchors) const { // For inlined code, recover the original callsite and callee by finding the // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the // top-level frame is "main:1", the callsite is "1" and the callee is "foo". @@ -101,7 +117,7 @@ void SampleProfileMatcher::findIRAnchors(const Function &F, } void SampleProfileMatcher::findProfileAnchors(const FunctionSamples &FS, - AnchorMap &ProfileAnchors) { + AnchorMap &ProfileAnchors) const { auto isInvalidLineOffset = [](uint32_t LineOffset) { return LineOffset & 0x8000; }; @@ -133,8 +149,44 @@ void SampleProfileMatcher::findProfileAnchors(const FunctionSamples &FS, } } -LocToLocMap SampleProfileMatcher::longestCommonSequence( - const AnchorList &AnchorList1, const AnchorList &AnchorList2) const { +bool SampleProfileMatcher::functionHasProfile(const FunctionId &IRFuncName, + Function *&FuncWithoutProfile) { + FuncWithoutProfile = nullptr; + auto R = FunctionsWithoutProfile.find(IRFuncName); + if (R != FunctionsWithoutProfile.end()) + FuncWithoutProfile = R->second; + return !FuncWithoutProfile; +} + +bool SampleProfileMatcher::isProfileUnused(const FunctionId &ProfileFuncName) { + return SymbolMap->find(ProfileFuncName) == SymbolMap->end(); +} + +bool SampleProfileMatcher::functionMatchesProfile( + const FunctionId &IRFuncName, const FunctionId &ProfileFuncName, + bool FindMatchedProfileOnly) { + if (IRFuncName == ProfileFuncName) + return true; + if (!SalvageUnusedProfile) + return false; + + // If IR function doesn't have profile and the profile is unused, try + // matching them. 
+ Function *IRFunc = nullptr; + if (functionHasProfile(IRFuncName, IRFunc) || + !isProfileUnused(ProfileFuncName)) + return false; + + assert(FunctionId(IRFunc->getName()) != ProfileFuncName && + "IR function should be different from profile function to match"); + return functionMatchesProfile(*IRFunc, ProfileFuncName, + FindMatchedProfileOnly); +} + +LocToLocMap +SampleProfileMatcher::longestCommonSequence(const AnchorList &AnchorList1, + const AnchorList &AnchorList2, + bool MatchUnusedFunction) { int32_t Size1 = AnchorList1.size(), Size2 = AnchorList2.size(), MaxDepth = Size1 + Size2; auto Index = [&](int32_t I) { return I + MaxDepth; }; @@ -195,7 +247,9 @@ LocToLocMap SampleProfileMatcher::longestCommonSequence( X = V[Index(K - 1)] + 1; Y = X - K; while (X < Size1 && Y < Size2 && - AnchorList1[X].second == AnchorList2[Y].second) + functionMatchesProfile( + AnchorList1[X].second, AnchorList2[Y].second, + !MatchUnusedFunction /* Find matched function only */)) X++, Y++; V[Index(K)] = X; @@ -266,6 +320,21 @@ void SampleProfileMatcher::matchNonCallsiteLocs( } } +// Filter the non-call locations from IRAnchors and ProfileAnchors and write +// them into a list for random access later. +void SampleProfileMatcher::getFilteredAnchorList( + const AnchorMap &IRAnchors, const AnchorMap &ProfileAnchors, + AnchorList &FilteredIRAnchorsList, AnchorList &FilteredProfileAnchorList) { + for (const auto &I : IRAnchors) { + if (I.second.stringRef().empty()) + continue; + FilteredIRAnchorsList.emplace_back(I); + } + + for (const auto &I : ProfileAnchors) + FilteredProfileAnchorList.emplace_back(I); +} + // Call target name anchor based profile fuzzy matching. // Input: // For IR locations, the anchor is the callee name of direct callsite; For @@ -285,23 +354,19 @@ void SampleProfileMatcher::matchNonCallsiteLocs( // The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9]. void SampleProfileMatcher::runStaleProfileMatching( const Function &F, const AnchorMap &IRAnchors, - const AnchorMap &ProfileAnchors, LocToLocMap &IRToProfileLocationMap) { + const AnchorMap &ProfileAnchors, LocToLocMap &IRToProfileLocationMap, + bool RunCFGMatching, bool RunCGMatching) { + if (!RunCFGMatching && !RunCGMatching) + return; LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName() << "\n"); assert(IRToProfileLocationMap.empty() && "Run stale profile matching only once per function"); AnchorList FilteredProfileAnchorList; - for (const auto &I : ProfileAnchors) - FilteredProfileAnchorList.emplace_back(I); - AnchorList FilteredIRAnchorsList; - // Filter the non-callsite from IRAnchors. - for (const auto &I : IRAnchors) { - if (I.second.stringRef().empty()) - continue; - FilteredIRAnchorsList.emplace_back(I); - } + getFilteredAnchorList(IRAnchors, ProfileAnchors, FilteredIRAnchorsList, + FilteredProfileAnchorList); if (FilteredIRAnchorsList.empty() || FilteredProfileAnchorList.empty()) return; @@ -317,14 +382,25 @@ void SampleProfileMatcher::runStaleProfileMatching( } // Match the callsite anchors by finding the longest common subsequence - // between IR and profile. Note that we need to use IR anchor as base(A side) - // to align with the order of IRToProfileLocationMap. + // between IR and profile. + // Define a match between two anchors as follows: + // 1) The function names of anchors are the same. + // 2) The similarity between the anchor functions is above a threshold if + // RunCGMatching is set. 
+ // For 2), we only consider the anchor functions from IR and profile don't + // appear on either side to reduce the matching scope. Note that we need to + // use IR anchor as base(A side) to align with the order of + // IRToProfileLocationMap. LocToLocMap MatchedAnchors = - longestCommonSequence(FilteredIRAnchorsList, FilteredProfileAnchorList); + longestCommonSequence(FilteredIRAnchorsList, FilteredProfileAnchorList, + RunCGMatching /* Match unused functions */); - // Match the non-callsite locations and write the result to + // CFG level matching: + // Apply the callsite matchings to infer matching for the basic + // block(non-callsite) locations and write the result to // IRToProfileLocationMap. - matchNonCallsiteLocs(MatchedAnchors, IRAnchors, IRToProfileLocationMap); + if (RunCFGMatching) + matchNonCallsiteLocs(MatchedAnchors, IRAnchors, IRToProfileLocationMap); } void SampleProfileMatcher::runOnFunction(Function &F) { @@ -335,6 +411,16 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // the maximum number of callsites, we merge the function profiles from all // contexts, aka, the flattened profile to find profile anchors. const auto *FSFlattened = getFlattenedSamplesFor(F); + if (SalvageUnusedProfile && !FSFlattened) { + // Apply the matching in place to find the new function's matched profile. + // TODO: For extended profile format, if a function profile is unused and + // it's top-level, even if the profile is matched, it's not found in the + // profile. This is because sample reader only read the used profile at the + // beginning, we need to support loading the profile on-demand in future. + auto R = FuncToProfileNameMap.find(&F); + if (R != FuncToProfileNameMap.end()) + FSFlattened = getFlattenedSamplesFor(R->second); + } if (!FSFlattened) return; @@ -352,28 +438,31 @@ void SampleProfileMatcher::runOnFunction(Function &F) { if (ReportProfileStaleness || PersistProfileStaleness) recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, nullptr); - // For probe-based profiles, run matching only when the current profile is not - // valid. - if (SalvageStaleProfile && (!FunctionSamples::ProfileIsProbeBased || - !ProbeManager->profileIsValid(F, *FSFlattened))) { - // For imported functions, the checksum metadata(pseudo_probe_desc) are - // dropped, so we leverage function attribute(profile-checksum-mismatch) to - // transfer the info: add the attribute during pre-link phase and check it - // during post-link phase(see "profileIsValid"). - if (FunctionSamples::ProfileIsProbeBased && - LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) - F.addFnAttr("profile-checksum-mismatch"); - - // The matching result will be saved to IRToProfileLocationMap, create a - // new map for each function. - auto &IRToProfileLocationMap = getIRToProfileLocationMap(F); - runStaleProfileMatching(F, IRAnchors, ProfileAnchors, - IRToProfileLocationMap); - // Find and update callsite match states after matching. - if (ReportProfileStaleness || PersistProfileStaleness) - recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, - &IRToProfileLocationMap); - } + if (!SalvageStaleProfile) + return; + // For probe-based profiles, run matching only when profile checksum is + // mismatched. 
+ bool ChecksumMismatch = FunctionSamples::ProfileIsProbeBased && + !ProbeManager->profileIsValid(F, *FSFlattened); + bool RunCFGMatching = + !FunctionSamples::ProfileIsProbeBased || ChecksumMismatch; + bool RunCGMatching = SalvageUnusedProfile; + // For imported functions, the checksum metadata(pseudo_probe_desc) are + // dropped, so we leverage function attribute(profile-checksum-mismatch) to + // transfer the info: add the attribute during pre-link phase and check it + // during post-link phase(see "profileIsValid"). + if (ChecksumMismatch && LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) + F.addFnAttr("profile-checksum-mismatch"); + + // The matching result will be saved to IRToProfileLocationMap, create a + // new map for each function. + auto &IRToProfileLocationMap = getIRToProfileLocationMap(F); + runStaleProfileMatching(F, IRAnchors, ProfileAnchors, IRToProfileLocationMap, + RunCFGMatching, RunCGMatching); + // Find and update callsite match states after matching. + if (RunCFGMatching && (ReportProfileStaleness || PersistProfileStaleness)) + recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, + &IRToProfileLocationMap); } void SampleProfileMatcher::recordCallsiteMatchStates( @@ -532,10 +621,35 @@ void SampleProfileMatcher::countMismatchCallsites(const FunctionSamples &FS) { } } +void SampleProfileMatcher::countCallGraphRecoveredSamples( + const FunctionSamples &FS, + std::unordered_set &CallGraphRecoveredProfiles) { + if (CallGraphRecoveredProfiles.count(FS.getFunction())) { + NumCallGraphRecoveredFuncSamples += FS.getTotalSamples(); + return; + } + + for (const auto &CM : FS.getCallsiteSamples()) { + for (const auto &CS : CM.second) { + countCallGraphRecoveredSamples(CS.second, CallGraphRecoveredProfiles); + } + } +} + void SampleProfileMatcher::computeAndReportProfileStaleness() { if (!ReportProfileStaleness && !PersistProfileStaleness) return; + std::unordered_set CallGraphRecoveredProfiles; + if (SalvageUnusedProfile) { + for (const auto &I : FuncToProfileNameMap) { + CallGraphRecoveredProfiles.insert(I.second); + if (GlobalValue::isAvailableExternallyLinkage(I.first->getLinkage())) + continue; + NumCallGraphRecoveredProfiledFunc++; + } + } + // Count profile mismatches for profile staleness report. for (const auto &F : M) { if (skipProfileForFunction(F)) @@ -550,6 +664,9 @@ void SampleProfileMatcher::computeAndReportProfileStaleness() { TotalProfiledFunc++; TotalFunctionSamples += FS->getTotalSamples(); + if (SalvageUnusedProfile && !CallGraphRecoveredProfiles.empty()) + countCallGraphRecoveredSamples(*FS, CallGraphRecoveredProfiles); + // Checksum mismatch is only used in pseudo-probe mode. 
if (FunctionSamples::ProfileIsProbeBased) countMismatchedFuncSamples(*FS, true); @@ -566,6 +683,13 @@ void SampleProfileMatcher::computeAndReportProfileStaleness() { << MismatchedFunctionSamples << "/" << TotalFunctionSamples << ") of samples are discarded due to function hash mismatch.\n"; } + if (SalvageUnusedProfile) { + errs() << "(" << NumCallGraphRecoveredProfiledFunc << "/" + << TotalProfiledFunc << ") of functions' profile are matched and (" + << NumCallGraphRecoveredFuncSamples << "/" << TotalFunctionSamples + << ") of samples are reused by call graph matching.\n"; + } + errs() << "(" << (NumMismatchedCallsites + NumRecoveredCallsites) << "/" << TotalProfiledCallsites << ") of callsites' profile are invalid and (" @@ -592,6 +716,13 @@ void SampleProfileMatcher::computeAndReportProfileStaleness() { ProfStatsVec.emplace_back("TotalFunctionSamples", TotalFunctionSamples); } + if (SalvageUnusedProfile) { + ProfStatsVec.emplace_back("NumCallGraphRecoveredProfiledFunc", + NumCallGraphRecoveredProfiledFunc); + ProfStatsVec.emplace_back("NumCallGraphRecoveredFuncSamples", + NumCallGraphRecoveredFuncSamples); + } + ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites); ProfStatsVec.emplace_back("NumRecoveredCallsites", NumRecoveredCallsites); ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites); @@ -606,14 +737,161 @@ void SampleProfileMatcher::computeAndReportProfileStaleness() { } } +void SampleProfileMatcher::findFunctionsWithoutProfile() { + // TODO: Support MD5 profile. + if (FunctionSamples::UseMD5) + return; + StringSet<> NamesInProfile; + if (auto NameTable = Reader.getNameTable()) { + for (auto Name : *NameTable) + NamesInProfile.insert(Name.stringRef()); + } + + for (auto &F : M) { + // Skip declarations, as even if the function can be matched, we have + // nothing to do with it. + if (F.isDeclaration()) + continue; + + StringRef CanonFName = FunctionSamples::getCanonicalFnName(F.getName()); + const auto *FS = getFlattenedSamplesFor(F); + if (FS) + continue; + + // For extended binary, functions fully inlined may not be loaded in the + // top-level profile, so check the NameTable which has the all symbol names + // in profile. + if (NamesInProfile.count(CanonFName)) + continue; + + // For extended binary, non-profiled function symbols are in the profile + // symbol list table. + if (PSL && PSL->contains(CanonFName)) + continue; + + LLVM_DEBUG(dbgs() << "Function " << CanonFName + << " is not in profile or profile symbol list.\n"); + FunctionsWithoutProfile[FunctionId(CanonFName)] = &F; + } +} + +bool SampleProfileMatcher::functionMatchesProfileHelper( + const Function &IRFunc, const FunctionId &ProfFunc) { + // The value is in the range [0, 1]. The bigger the value is, the more similar + // two sequences are. + float Similarity = 0.0; + + const auto *FSFlattened = getFlattenedSamplesFor(ProfFunc); + if (!FSFlattened) + return false; + // The check for similarity or checksum may not be reliable if the function is + // tiny, we use the number of basic block as a proxy for the function + // complexity and skip the matching if it's too small. + if (IRFunc.size() < MinFuncCountForCGMatching || + FSFlattened->getBodySamples().size() < MinFuncCountForCGMatching) + return false; + + // For probe-based function, we first trust the checksum info. If the checksum + // doesn't match, we continue checking for similarity. 
+ if (FunctionSamples::ProfileIsProbeBased) { + const auto *FuncDesc = ProbeManager->getDesc(IRFunc); + if (FuncDesc && + !ProbeManager->profileIsHashMismatched(*FuncDesc, *FSFlattened)) { + LLVM_DEBUG(dbgs() << "The checksums for " << IRFunc.getName() + << "(IR) and " << ProfFunc << "(Profile) match.\n"); + + return true; + } + } + + AnchorMap IRAnchors; + findIRAnchors(IRFunc, IRAnchors); + AnchorMap ProfileAnchors; + findProfileAnchors(*FSFlattened, ProfileAnchors); + + AnchorList FilteredIRAnchorsList; + AnchorList FilteredProfileAnchorList; + getFilteredAnchorList(IRAnchors, ProfileAnchors, FilteredIRAnchorsList, + FilteredProfileAnchorList); + + // Similarly skip the matching if the num of anchors is not enough. + if (FilteredIRAnchorsList.size() < MinCallCountForCGMatching || + FilteredProfileAnchorList.size() < MinCallCountForCGMatching) + return false; + + // Use the diff algorithm to find the LCS between IR and profile. + + // Don't recursively match the callee function to avoid infinite matching, + // callee functions will be handled later since it's processed in top-down + // order . + LocToLocMap MatchedAnchors = + longestCommonSequence(FilteredIRAnchorsList, FilteredProfileAnchorList, + false /* Match unused functions */); + + Similarity = + static_cast(MatchedAnchors.size()) * 2 / + (FilteredIRAnchorsList.size() + FilteredProfileAnchorList.size()); + + LLVM_DEBUG(dbgs() << "The similarity between " << IRFunc.getName() + << "(IR) and " << ProfFunc << "(profile) is " + << format("%.2f", Similarity) << "\n"); + assert((Similarity >= 0 && Similarity <= 1.0) && + "Similarity value should be in [0, 1]"); + return Similarity * 100 > FuncProfileSimilarityThreshold; +} + +// If FindMatchedProfileOnly is set to true, only use the processed function +// results. This is used for skipping the repeated recursive matching. +bool SampleProfileMatcher::functionMatchesProfile(Function &IRFunc, + const FunctionId &ProfFunc, + bool FindMatchedProfileOnly) { + auto R = FuncProfileMatchCache.find({&IRFunc, ProfFunc}); + if (R != FuncProfileMatchCache.end()) + return R->second; + + if (FindMatchedProfileOnly) + return false; + + bool Matched = functionMatchesProfileHelper(IRFunc, ProfFunc); + FuncProfileMatchCache[{&IRFunc, ProfFunc}] = Matched; + if (Matched) { + FuncToProfileNameMap[&IRFunc] = ProfFunc; + LLVM_DEBUG(dbgs() << "Function:" << IRFunc.getName() + << " matches profile:" << ProfFunc << "\n"); + } + + return Matched; +} + void SampleProfileMatcher::runOnModule() { ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles, FunctionSamples::ProfileIsCS); - for (auto &F : M) { - if (skipProfileForFunction(F)) + if (SalvageUnusedProfile) + findFunctionsWithoutProfile(); + + // Process the matching in top-down order so that the caller matching result + // can be used to the callee matching. + std::vector TopDownFunctionList; + TopDownFunctionList.reserve(M.size()); + buildTopDownFuncOrder(CG, TopDownFunctionList); + for (auto *F : TopDownFunctionList) { + if (skipProfileForFunction(*F)) continue; - runOnFunction(F); + runOnFunction(*F); } + + // Update the data in SampleLoader. + if (SalvageUnusedProfile) + for (auto &I : FuncToProfileNameMap) { + assert(I.first && "New function is null"); + FunctionId FuncName(I.first->getName()); + FuncNameToProfNameMap->emplace(FuncName, I.second); + // We need to remove the old entry to avoid duplicating the function + // processing. 
+ SymbolMap->erase(FuncName); + SymbolMap->emplace(I.second, I.first); + } + if (SalvageStaleProfile) distributeIRToProfileLocationMap(); diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index ac80a31d8fd4b..e5aebc4850e6d 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -31,9 +31,9 @@ ; CHECK-EP-PIPELINE-START: Running pass: NoOpModulePass ; CHECK-O: Running pass: SampleProfileLoaderPass ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy -; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis ; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion ; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 210a4ef1f7664..0bb26330d000a 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -44,8 +44,8 @@ ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Running pass: SampleProfileLoaderPass -; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis ; CHECK-O-NEXT: Running pass: OpenMPOptPass ; CHECK-O-NEXT: Running pass: IPSCCPPass diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming-recursive.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming-recursive.prof new file mode 100644 index 0000000000000..edb1404c1d517 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming-recursive.prof @@ -0,0 +1,11 @@ +main:42:0 + 1: 0 + 6: 2 + 7: 0 + 5: foo:40 + 1: 20 + 2: bar:20 + 1: 20 + !CFGChecksum: 4294967295 + !CFGChecksum: 281479271677951 + !CFGChecksum: 281582264815352 diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming.prof new file mode 100644 index 0000000000000..78ff0f322dd0f --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-renaming.prof @@ -0,0 +1,57 @@ +main:47:0 + 1: 0 + 2: 2 + 3: 0 + 4: 3 + 7: 2 test_noninline:2 + 8: 2 + 9: 0 + 5: foo:24 + 1: 4 + 2: 3 bar:3 + 4: 3 bar:3 + 5: 1 mismatch:1 + 3: baz:15 + 1: 3 + 2: block_only:12 + 1: 3 + 3: 3 + 5: 3 + 10: 3 + !CFGChecksum: 206551239323 + !CFGChecksum: 281479271677951 + !CFGChecksum: 123456 + 6: baz:14 + 1: 3 + 2: block_only:11 + 1: 3 + 3: 3 + 5: 3 + 10: 2 + !CFGChecksum: 206551239323 + !CFGChecksum: 281479271677951 + 10: cold_func:0 + 1: 0 + 2: 0 block_only:0 + !CFGChecksum: 281479271677951 + !CFGChecksum: 1126003093360596 +test_noninline:22:2 + 1: 2 + 2: foo:20 + 1: 3 + 2: 2 bar:3 + 4: 3 bar:3 + 3: baz:13 + 1: 2 + 2: block_only:11 + 1: 2 + 3: 3 + 5: 3 + 10: 3 + !CFGChecksum: 206551239323 + !CFGChecksum: 281479271677951 + !CFGChecksum: 
123456 + !CFGChecksum: 281479271677951 +bar:12:12 + 1: 12 + !CFGChecksum: 4294967295 diff --git a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll index 5394a00ced86a..3ca94a4563675 100644 --- a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll +++ b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll @@ -48,18 +48,18 @@ ; } ; } -; CHECK: Run stale profile matching for bar - -; CHECK: Run stale profile matching for foo -; CHECK: Callsite with callee:bar is matched from 1.15 to 1.15 -; CHECK: Callsite with callee:bar is matched from 2 to 2 - ; CHECK: Run stale profile matching for main ; CHECK: Callsite with callee:foo is matched from 4 to 2 ; CHECK: Callsite with callee:bar is matched from 5 to 3 ; CHECK: Callsite with callee:foo is matched from 8 to 4 ; CHECK: Callsite with callee:bar is matched from 9 to 5 +; CHECK: Run stale profile matching for foo +; CHECK: Callsite with callee:bar is matched from 1.15 to 1.15 +; CHECK: Callsite with callee:bar is matched from 2 to 2 + +; CHECK: Run stale profile matching for bar + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll index 4b8cd853301ed..cdd365b6fb673 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll @@ -3,17 +3,6 @@ ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-LCS.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-LCS.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl --salvage-stale-profile-max-callsites=6 2>&1 | FileCheck %s -check-prefix=CHECK-MAX-CALLSITES -; CHECK: Run stale profile matching for test_direct_call -; CHECK: Location is matched from 1 to 1 -; CHECK: Location is matched from 2 to 2 -; CHECK: Location is matched from 3 to 3 -; CHECK: Callsite with callee:C is matched from 4 to 2 -; CHECK: Location is rematched backwards from 3 to 1 -; CHECK: Callsite with callee:A is matched from 5 to 4 -; CHECK: Callsite with callee:B is matched from 6 to 5 -; CHECK: Location is matched from 7 to 6 -; CHECK: Callsite with callee:A is matched from 8 to 6 - ; CHECK: Run stale profile matching for test_indirect_call ; CHECK: Location is matched from 1 to 1 ; CHECK: Location is matched from 2 to 2 @@ -28,6 +17,17 @@ ; CHECK: Callsite with callee:unknown.indirect.callee is matched from 9 to 6 ; CHECK: Callsite with callee:C is matched from 10 to 7 +; CHECK: Run stale profile matching for test_direct_call +; CHECK: Location is matched from 1 to 1 +; CHECK: Location is matched from 2 to 2 +; CHECK: Location is matched from 3 to 3 +; CHECK: Callsite with callee:C is matched from 4 to 2 +; CHECK: Location is rematched backwards from 3 to 1 +; CHECK: Callsite with callee:A is matched from 5 to 4 +; CHECK: Callsite with callee:B is matched from 6 to 5 +; CHECK: Location is matched from 7 to 6 +; CHECK: Callsite with callee:A is matched from 8 to 
6 + ; CHECK-MAX-CALLSITES: Skip stale profile matching for test_direct_call ; CHECK-MAX-CALLSITES-NOT: Skip stale profile matching for test_indirect_call diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming-recursive.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming-recursive.ll new file mode 100644 index 0000000000000..d9db804b56364 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming-recursive.ll @@ -0,0 +1,150 @@ +; REQUIRES: x86_64-linux +; REQUIRES: asserts +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-renaming-recursive.prof --salvage-stale-profile --salvage-unused-profile -report-profile-staleness -persist-profile-staleness -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline --min-call-count-for-cg-matching=0 --min-func-count-for-cg-matching=0 2>&1 | FileCheck %s + +; CHECK: Run stale profile matching for main +; CHECK: Function:foo_new matches profile:foo +; CHECK: Run stale profile matching for foo_new +; CHECK: Function:bar_new matches profile:bar +; CHECK: Run stale profile matching for bar_new + +; CHECK: Function processing order: +; CHECK: main +; CHECK: foo_new +; CHECK: bar_new + +; CHECK: 'foo_new' inlined into 'main' to match profiling context with (cost=0, threshold=3000) at callsite main:2:7; +; CHECK: 'bar_new' inlined into 'main' to match profiling context with (cost=-15, threshold=3000) at callsite foo_new:1:3 @ main:2:7; + + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = dso_local global i32 0, align 4, !dbg !0 + +; Function Attrs: nounwind uwtable +define dso_local void @bar_new() #0 !dbg !18 { +entry: + call void @llvm.pseudoprobe(i64 8236371237083957767, i64 1, i32 0, i64 -1), !dbg !21 + %0 = load volatile i32, ptr @x, align 4, !dbg !21, !tbaa !22 + %inc = add nsw i32 %0, 1, !dbg !21 + store volatile i32 %inc, ptr @x, align 4, !dbg !21, !tbaa !22 + ret void, !dbg !26 +} + +; Function Attrs: nounwind uwtable +define dso_local void @foo_new() #0 !dbg !27 { +entry: + call void @llvm.pseudoprobe(i64 -837213161392124280, i64 1, i32 0, i64 -1), !dbg !28 + call void @bar_new(), !dbg !29 + ret void, !dbg !31 +} + +; Function Attrs: nounwind uwtable +define dso_local i32 @main() #0 !dbg !32 { +entry: + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !38 + #dbg_value(i32 0, !36, !DIExpression(), !39) + br label %for.cond, !dbg !40 + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ], !dbg !41 + #dbg_value(i32 %i.0, !36, !DIExpression(), !39) + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !42 + %cmp = icmp slt i32 %i.0, 1000000, !dbg !44 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !45 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 3, i32 0, i64 -1), !dbg !46 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 7, i32 0, i64 -1), !dbg !47 + ret i32 0, !dbg !47 + +for.body: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 4, i32 0, i64 -1), !dbg !48 + call void @foo_new(), !dbg !50 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 6, i32 0, i64 -1), !dbg !52 + %inc = add nsw i32 %i.0, 1, !dbg !52 + #dbg_value(i32 %inc, !36, !DIExpression(), !39) + br label 
%for.cond, !dbg !53, !llvm.loop !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #3 + +attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} +!llvm.ident = !{!14} +!llvm.pseudo_probe_desc = !{!15, !16, !17} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "test.c", directory: "/home/", checksumkind: CSK_MD5, checksum: "48867dcc5b42e2991317c585b7545860") +!4 = !{!0} +!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6) +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 5} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{i32 8, !"PIC Level", i32 2} +!11 = !{i32 7, !"PIE Level", i32 2} +!12 = !{i32 7, !"uwtable", i32 2} +!13 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!14 = !{!"clang version 19.0.0"} +!15 = !{i64 8236371237083957767, i64 4294967295, !"bar_new"} +!16 = !{i64 -837213161392124280, i64 281479271677951, !"foo_new"} +!17 = !{i64 -2624081020897602054, i64 281582264815352, !"main"} +!18 = distinct !DISubprogram(name: "bar_new", scope: !3, file: !3, line: 3, type: !19, scopeLine: 3, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!19 = !DISubroutineType(types: !20) +!20 = !{null} +!21 = !DILocation(line: 4, column: 4, scope: !18) +!22 = !{!23, !23, i64 0} +!23 = !{!"int", !24, i64 0} +!24 = !{!"omnipotent char", !25, i64 0} +!25 = !{!"Simple C/C++ TBAA"} +!26 = !DILocation(line: 5, column: 1, scope: !18) +!27 = distinct !DISubprogram(name: "foo_new", scope: !3, file: !3, line: 7, type: !19, scopeLine: 7, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!28 = !DILocation(line: 8, column: 3, scope: !27) +!29 = !DILocation(line: 8, column: 3, scope: !30) +!30 = !DILexicalBlockFile(scope: !27, file: !3, discriminator: 455082007) +!31 = !DILocation(line: 9, column: 1, scope: !27) +!32 = distinct !DISubprogram(name: 
"main", scope: !3, file: !3, line: 11, type: !33, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !35) +!33 = !DISubroutineType(types: !34) +!34 = !{!6} +!35 = !{!36} +!36 = !DILocalVariable(name: "i", scope: !37, file: !3, line: 12, type: !6) +!37 = distinct !DILexicalBlock(scope: !32, file: !3, line: 12, column: 3) +!38 = !DILocation(line: 12, column: 12, scope: !37) +!39 = !DILocation(line: 0, scope: !37) +!40 = !DILocation(line: 12, column: 8, scope: !37) +!41 = !DILocation(line: 12, scope: !37) +!42 = !DILocation(line: 12, column: 19, scope: !43) +!43 = distinct !DILexicalBlock(scope: !37, file: !3, line: 12, column: 3) +!44 = !DILocation(line: 12, column: 21, scope: !43) +!45 = !DILocation(line: 12, column: 3, scope: !37) +!46 = !DILocation(line: 0, scope: !32) +!47 = !DILocation(line: 15, column: 1, scope: !32) +!48 = !DILocation(line: 13, column: 7, scope: !49) +!49 = distinct !DILexicalBlock(scope: !43, file: !3, line: 12, column: 41) +!50 = !DILocation(line: 13, column: 7, scope: !51) +!51 = !DILexicalBlockFile(scope: !49, file: !3, discriminator: 455082031) +!52 = !DILocation(line: 12, column: 37, scope: !43) +!53 = !DILocation(line: 12, column: 3, scope: !43) +!54 = distinct !{!54, !45, !55, !56} +!55 = !DILocation(line: 14, column: 3, scope: !37) +!56 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming.ll new file mode 100644 index 0000000000000..a549812f46ef6 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-renaming.ll @@ -0,0 +1,313 @@ +; REQUIRES: x86_64-linux +; REQUIRES: asserts +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-renaming.prof --salvage-stale-profile --salvage-unused-profile -report-profile-staleness -persist-profile-staleness -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline --min-call-count-for-cg-matching=0 --min-func-count-for-cg-matching=0 2>&1 | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-renaming.prof --salvage-stale-profile --salvage-unused-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl --min-call-count-for-cg-matching=10 --min-func-count-for-cg-matching=10 2>&1 | FileCheck %s --check-prefix=TINY-FUNC + +; Verify find new IR functions. +; CHECK: Function new_block_only is not in profile or profile symbol list. +; CHECK: Function new_foo is not in profile or profile symbol list. + +; CHECK: Run stale profile matching for main +; CHECK: The similarity between new_foo(IR) and foo(profile) is 0.86 +; CHECK: Function:new_foo matches profile:foo +; CHECK: Run stale profile matching for cold_func +; CHECK: The checksums for new_block_only(IR) and block_only(Profile) match. +; CHECK: Function:new_block_only matches profile:block_only +; CHECK: Run stale profile matching for test_noninline +; CHECK: Run stale profile matching for baz +; CHECK: Run stale profile matching for bar + +; CHECK: (2/3) of functions' profile are matched and (55/81) of samples are reused by call graph matching. + +; Verify the matched function is updated correctly by checking the inlining. 
+; CHECK: 'new_foo' inlined into 'main' to match profiling context with (cost=110, threshold=3000) at callsite main:2:7.5; +; CHECK: 'new_block_only' inlined into 'main' to match profiling context with (cost=75, threshold=3000) at callsite baz:1:3.2 @ main:3:7.6 +; CHECK: 'new_block_only' inlined into 'main' to match profiling context with (cost=75, threshold=3000) at callsite baz:1:3.2 @ new_foo:2:3.3 @ main:2:7.5; +; CHECK: 'new_foo' inlined into 'test_noninline' to match profiling context with (cost=110, threshold=3000) at callsite test_noninline:1:3.2; + +; CHECK: !"NumCallGraphRecoveredProfiledFunc", i64 2, !"NumCallGraphRecoveredFuncSamples", i64 55 + +; TINY-FUNC-NOT: Function:new_foo matches profile:foo +; TINY-FUNC-NOT: Function:new_block_only matches profile:block_only + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = dso_local global i32 0, align 4, !dbg !0 + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @bar(i32 noundef %x) #0 !dbg !22 { +entry: + #dbg_value(i32 %x, !26, !DIExpression(), !27) + call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1), !dbg !28 + %add = add nsw i32 %x, 1, !dbg !29 + ret i32 %add, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @new_block_only() #2 !dbg !31 { +entry: + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 1, i32 0, i64 -1), !dbg !34 + %0 = load volatile i32, ptr @x, align 4, !dbg !34, !tbaa !36 + %cmp = icmp eq i32 %0, 9999, !dbg !40 + br i1 %cmp, label %if.then, label %if.else, !dbg !41 + +if.then: ; preds = %entry + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 2, i32 0, i64 -1), !dbg !42 + %1 = load volatile i32, ptr @x, align 4, !dbg !42, !tbaa !36 + %add = add nsw i32 %1, 1000, !dbg !42 + store volatile i32 %add, ptr @x, align 4, !dbg !42, !tbaa !36 + br label %if.end10, !dbg !43 + +if.else: ; preds = %entry + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 3, i32 0, i64 -1), !dbg !44 + %2 = load volatile i32, ptr @x, align 4, !dbg !44, !tbaa !36 + %cmp1 = icmp eq i32 %2, 999, !dbg !46 + br i1 %cmp1, label %if.then2, label %if.else4, !dbg !47 + +if.then2: ; preds = %if.else + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 4, i32 0, i64 -1), !dbg !48 + %3 = load volatile i32, ptr @x, align 4, !dbg !48, !tbaa !36 + %add3 = add nsw i32 %3, 100, !dbg !48 + store volatile i32 %add3, ptr @x, align 4, !dbg !48, !tbaa !36 + br label %if.end10, !dbg !49 + +if.else4: ; preds = %if.else + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 5, i32 0, i64 -1), !dbg !50 + %4 = load volatile i32, ptr @x, align 4, !dbg !50, !tbaa !36 + %cmp5 = icmp eq i32 %4, 99, !dbg !52 + br i1 %cmp5, label %if.then6, label %if.else8, !dbg !53 + +if.then6: ; preds = %if.else4 + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 6, i32 0, i64 -1), !dbg !54 + %5 = load volatile i32, ptr @x, align 4, !dbg !54, !tbaa !36 + %add7 = add nsw i32 %5, 10, !dbg !54 + store volatile i32 %add7, ptr @x, align 4, !dbg !54, !tbaa !36 + br label %if.end10, !dbg !55 + +if.else8: ; preds = %if.else4 + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 7, i32 0, i64 -1), !dbg !56 + %6 = load volatile i32, ptr @x, align 4, !dbg !56, !tbaa !36 + %inc = add nsw i32 %6, 1, !dbg !56 + store 
volatile i32 %inc, ptr @x, align 4, !dbg !56, !tbaa !36 + br label %if.end10 + +if.end10: ; preds = %if.then2, %if.else8, %if.then6, %if.then + call void @llvm.pseudoprobe(i64 2964250471062803127, i64 10, i32 0, i64 -1), !dbg !57 + ret void, !dbg !57 +} + +; Function Attrs: nounwind uwtable +define dso_local void @baz() #2 !dbg !58 { +entry: + call void @llvm.pseudoprobe(i64 7546896869197086323, i64 1, i32 0, i64 -1), !dbg !59 + call void @new_block_only(), !dbg !60 + ret void, !dbg !62 +} + +; Function Attrs: nounwind uwtable +define dso_local void @new_foo() #2 !dbg !63 { +entry: + call void @llvm.pseudoprobe(i64 5381804724291869009, i64 1, i32 0, i64 -1), !dbg !64 + %0 = load volatile i32, ptr @x, align 4, !dbg !64, !tbaa !36 + %call = call i32 @bar(i32 noundef %0), !dbg !65 + %1 = load volatile i32, ptr @x, align 4, !dbg !67, !tbaa !36 + %add = add nsw i32 %1, %call, !dbg !67 + store volatile i32 %add, ptr @x, align 4, !dbg !67, !tbaa !36 + call void @baz(), !dbg !68 + %2 = load volatile i32, ptr @x, align 4, !dbg !70, !tbaa !36 + %call1 = call i32 @bar(i32 noundef %2), !dbg !71 + %3 = load volatile i32, ptr @x, align 4, !dbg !73, !tbaa !36 + %add2 = add nsw i32 %3, %call1, !dbg !73 + store volatile i32 %add2, ptr @x, align 4, !dbg !73, !tbaa !36 + ret void, !dbg !74 +} + +; Function Attrs: noinline nounwind uwtable +define dso_local void @test_noninline() #0 !dbg !75 { +entry: + call void @llvm.pseudoprobe(i64 -5610330892148506720, i64 1, i32 0, i64 -1), !dbg !76 + call void @new_foo(), !dbg !77 + ret void, !dbg !79 +} + +; Function Attrs: nounwind uwtable +define dso_local void @cold_func() #2 !dbg !80 { +entry: + call void @llvm.pseudoprobe(i64 2711072140522378707, i64 1, i32 0, i64 -1), !dbg !81 + call void @new_block_only(), !dbg !82 + ret void, !dbg !84 +} + +; Function Attrs: nounwind uwtable +define dso_local i32 @main() #2 !dbg !85 { +entry: + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !91 + #dbg_value(i32 0, !89, !DIExpression(), !92) + br label %for.cond, !dbg !93 + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ], !dbg !94 + #dbg_value(i32 %i.0, !89, !DIExpression(), !92) + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !95 + %cmp = icmp slt i32 %i.0, 1000000, !dbg !97 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !98 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 3, i32 0, i64 -1), !dbg !99 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 9, i32 0, i64 -1), !dbg !100 + call void @cold_func(), !dbg !101 + ret i32 0, !dbg !103 + +for.body: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 4, i32 0, i64 -1), !dbg !104 + call void @new_foo(), !dbg !106 + call void @baz(), !dbg !108 + call void @test_noninline(), !dbg !110 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 8, i32 0, i64 -1), !dbg !112 + %inc = add nsw i32 %i.0, 1, !dbg !112 + #dbg_value(i32 %inc, !89, !DIExpression(), !92) + br label %for.cond, !dbg !113, !llvm.loop !114 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind 
willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4 + +attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} +!llvm.ident = !{!14} +!llvm.pseudo_probe_desc = !{!15, !16, !17, !18, !19, !20, !21} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 2e1509152224d8ffbeac84c489920dcbaeefc2b2)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "test_rename.c", directory: "/home/wlei/local/toytest/rename", checksumkind: CSK_MD5, checksum: "b07f600b3cdefd40bd44932bc13c33f5") +!4 = !{!0} +!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6) +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 5} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{i32 8, !"PIC Level", i32 2} +!11 = !{i32 7, !"PIE Level", i32 2} +!12 = !{i32 7, !"uwtable", i32 2} +!13 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!14 = !{!"clang version 19.0.0git (https://github.com/llvm/llvm-project.git 2e1509152224d8ffbeac84c489920dcbaeefc2b2)"} +!15 = !{i64 -2012135647395072713, i64 4294967295, !"bar"} +!16 = !{i64 2964250471062803127, i64 206551239323, !"new_block_only"} +!17 = !{i64 7546896869197086323, i64 281479271677951, !"baz"} +!18 = !{i64 5381804724291869009, i64 844429225099263, !"new_foo"} +!19 = !{i64 -5610330892148506720, i64 281479271677951, !"test_noninline"} +!20 = !{i64 2711072140522378707, i64 281479271677951, !"cold_func"} +!21 = !{i64 -2624081020897602054, i64 1126003093360596, !"main"} +!22 = distinct !DISubprogram(name: "bar", scope: !3, file: !3, line: 3, type: !23, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !25) +!23 = !DISubroutineType(types: !24) +!24 = !{!6, !6} +!25 = !{!26} +!26 = !DILocalVariable(name: "x", arg: 1, scope: !22, file: !3, line: 3, type: !6) +!27 = !DILocation(line: 0, scope: !22) +!28 = !DILocation(line: 4, column: 10, scope: !22) +!29 = !DILocation(line: 4, column: 12, scope: !22) +!30 = !DILocation(line: 4, column: 3, scope: !22) +!31 = distinct !DISubprogram(name: "new_block_only", scope: !3, file: !3, line: 7, type: !32, scopeLine: 7, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!32 
= !DISubroutineType(types: !33) +!33 = !{null} +!34 = !DILocation(line: 8, column: 6, scope: !35) +!35 = distinct !DILexicalBlock(scope: !31, file: !3, line: 8, column: 6) +!36 = !{!37, !37, i64 0} +!37 = !{!"int", !38, i64 0} +!38 = !{!"omnipotent char", !39, i64 0} +!39 = !{!"Simple C/C++ TBAA"} +!40 = !DILocation(line: 8, column: 8, scope: !35) +!41 = !DILocation(line: 8, column: 6, scope: !31) +!42 = !DILocation(line: 9, column: 7, scope: !35) +!43 = !DILocation(line: 9, column: 5, scope: !35) +!44 = !DILocation(line: 10, column: 12, scope: !45) +!45 = distinct !DILexicalBlock(scope: !35, file: !3, line: 10, column: 12) +!46 = !DILocation(line: 10, column: 14, scope: !45) +!47 = !DILocation(line: 10, column: 12, scope: !35) +!48 = !DILocation(line: 11, column: 7, scope: !45) +!49 = !DILocation(line: 11, column: 5, scope: !45) +!50 = !DILocation(line: 12, column: 12, scope: !51) +!51 = distinct !DILexicalBlock(scope: !45, file: !3, line: 12, column: 12) +!52 = !DILocation(line: 12, column: 14, scope: !51) +!53 = !DILocation(line: 12, column: 12, scope: !45) +!54 = !DILocation(line: 13, column: 7, scope: !51) +!55 = !DILocation(line: 13, column: 5, scope: !51) +!56 = !DILocation(line: 15, column: 6, scope: !51) +!57 = !DILocation(line: 16, column: 1, scope: !31) +!58 = distinct !DISubprogram(name: "baz", scope: !3, file: !3, line: 18, type: !32, scopeLine: 18, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!59 = !DILocation(line: 19, column: 3, scope: !58) +!60 = !DILocation(line: 19, column: 3, scope: !61) +!61 = !DILexicalBlockFile(scope: !58, file: !3, discriminator: 186646551) +!62 = !DILocation(line: 20, column: 1, scope: !58) +!63 = distinct !DISubprogram(name: "new_foo", scope: !3, file: !3, line: 22, type: !32, scopeLine: 22, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!64 = !DILocation(line: 23, column: 12, scope: !63) +!65 = !DILocation(line: 23, column: 8, scope: !66) +!66 = !DILexicalBlockFile(scope: !63, file: !3, discriminator: 186646551) +!67 = !DILocation(line: 23, column: 5, scope: !63) +!68 = !DILocation(line: 24, column: 3, scope: !69) +!69 = !DILexicalBlockFile(scope: !63, file: !3, discriminator: 186646559) +!70 = !DILocation(line: 25, column: 12, scope: !63) +!71 = !DILocation(line: 25, column: 8, scope: !72) +!72 = !DILexicalBlockFile(scope: !63, file: !3, discriminator: 186646567) +!73 = !DILocation(line: 25, column: 5, scope: !63) +!74 = !DILocation(line: 26, column: 1, scope: !63) +!75 = distinct !DISubprogram(name: "test_noninline", scope: !3, file: !3, line: 28, type: !32, scopeLine: 28, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!76 = !DILocation(line: 29, column: 3, scope: !75) +!77 = !DILocation(line: 29, column: 3, scope: !78) +!78 = !DILexicalBlockFile(scope: !75, file: !3, discriminator: 186646551) +!79 = !DILocation(line: 30, column: 1, scope: !75) +!80 = distinct !DISubprogram(name: "cold_func", scope: !3, file: !3, line: 32, type: !32, scopeLine: 32, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!81 = !DILocation(line: 32, column: 20, scope: !80) +!82 = !DILocation(line: 32, column: 20, scope: !83) +!83 = !DILexicalBlockFile(scope: !80, file: !3, discriminator: 186646551) +!84 = !DILocation(line: 32, column: 37, scope: !80) +!85 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 34, type: !86, scopeLine: 34, flags: DIFlagPrototyped | 
DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !88) +!86 = !DISubroutineType(types: !87) +!87 = !{!6} +!88 = !{!89} +!89 = !DILocalVariable(name: "i", scope: !90, file: !3, line: 35, type: !6) +!90 = distinct !DILexicalBlock(scope: !85, file: !3, line: 35, column: 3) +!91 = !DILocation(line: 35, column: 12, scope: !90) +!92 = !DILocation(line: 0, scope: !90) +!93 = !DILocation(line: 35, column: 8, scope: !90) +!94 = !DILocation(line: 35, scope: !90) +!95 = !DILocation(line: 35, column: 19, scope: !96) +!96 = distinct !DILexicalBlock(scope: !90, file: !3, line: 35, column: 3) +!97 = !DILocation(line: 35, column: 21, scope: !96) +!98 = !DILocation(line: 35, column: 3, scope: !90) +!99 = !DILocation(line: 0, scope: !85) +!100 = !DILocation(line: 40, column: 3, scope: !85) +!101 = !DILocation(line: 40, column: 3, scope: !102) +!102 = !DILexicalBlockFile(scope: !85, file: !3, discriminator: 186646615) +!103 = !DILocation(line: 41, column: 1, scope: !85) +!104 = !DILocation(line: 36, column: 7, scope: !105) +!105 = distinct !DILexicalBlock(scope: !96, file: !3, line: 35, column: 41) +!106 = !DILocation(line: 36, column: 7, scope: !107) +!107 = !DILexicalBlockFile(scope: !105, file: !3, discriminator: 186646575) +!108 = !DILocation(line: 37, column: 7, scope: !109) +!109 = !DILexicalBlockFile(scope: !105, file: !3, discriminator: 186646583) +!110 = !DILocation(line: 38, column: 7, scope: !111) +!111 = !DILexicalBlockFile(scope: !105, file: !3, discriminator: 186646591) +!112 = !DILocation(line: 35, column: 37, scope: !96) +!113 = !DILocation(line: 35, column: 3, scope: !96) +!114 = distinct !{!114, !98, !115, !116} +!115 = !DILocation(line: 39, column: 3, scope: !90) +!116 = !{!"llvm.loop.mustprogress"} From 0bb68b55715487447ffceaa1ab59f7a0bc8c7979 Mon Sep 17 00:00:00 2001 From: Doug Wyatt Date: Wed, 17 Jul 2024 10:36:36 -0700 Subject: [PATCH 312/777] Performance optimizations for function effects (nonblocking attribute etc.) (#96844) - Put new FunctionProtoType trailing objects last. - Inline FunctionEffectsRef::get() - Manually inline FunctionEffectsRef::Profile(). --------- Co-authored-by: Doug Wyatt --- clang/include/clang/AST/ASTContext.h | 5 ++ clang/include/clang/AST/Type.h | 32 ++++++++---- clang/lib/AST/ASTContext.cpp | 12 +++-- clang/lib/AST/Type.cpp | 36 +++++--------- clang/lib/Sema/Sema.cpp | 4 +- clang/lib/Sema/SemaDecl.cpp | 74 ++++++++++++++-------------- clang/lib/Sema/SemaDeclCXX.cpp | 62 ++++++++++++----------- clang/lib/Sema/SemaOverload.cpp | 2 +- 8 files changed, 119 insertions(+), 108 deletions(-) diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 13aa203de32ba..608bd90fcc3ff 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -643,6 +643,9 @@ class ASTContext : public RefCountedBase { /// address spaces (e.g. OpenCL/CUDA) bool AddrSpaceMapMangling; + /// For performance, track whether any function effects are in use. + mutable bool AnyFunctionEffects = false; + const TargetInfo *Target = nullptr; const TargetInfo *AuxTarget = nullptr; clang::PrintingPolicy PrintingPolicy; @@ -2909,6 +2912,8 @@ class ASTContext : public RefCountedBase { return AddrSpaceMapMangling || isTargetAddressSpace(AS); } + bool hasAnyFunctionEffects() const { return AnyFunctionEffects; } + // Merges two exception specifications, such that the resulting // exception spec is the union of both. 
For example, if either // of them can throw something, the result can throw it as well. diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 4c9ba37fe1e3a..25defea58c2dc 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -132,7 +132,6 @@ class TemplateArgument; class TemplateArgumentListInfo; class TemplateArgumentLoc; class TemplateTypeParmDecl; -template class TreeTransform; class TypedefNameDecl; class UnresolvedUsingTypenameDecl; class UsingShadowDecl; @@ -4901,7 +4900,6 @@ class FunctionEffectsRef { return !(LHS == RHS); } - void Profile(llvm::FoldingSetNodeID &ID) const; void dump(llvm::raw_ostream &OS) const; }; @@ -4971,8 +4969,8 @@ class FunctionProtoType final FunctionProtoType, QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields, FunctionType::FunctionTypeArmAttributes, FunctionType::ExceptionType, - Expr *, FunctionDecl *, FunctionType::ExtParameterInfo, - FunctionEffect, EffectConditionExpr, Qualifiers> { + Expr *, FunctionDecl *, FunctionType::ExtParameterInfo, Qualifiers, + FunctionEffect, EffectConditionExpr> { friend class ASTContext; // ASTContext creates these. friend TrailingObjects; @@ -5003,21 +5001,21 @@ class FunctionProtoType final // an ExtParameterInfo for each of the parameters. Present if and // only if hasExtParameterInfos() is true. // + // * Optionally a Qualifiers object to represent extra qualifiers that can't + // be represented by FunctionTypeBitfields.FastTypeQuals. Present if and + // only if hasExtQualifiers() is true. + // // * Optionally, an array of getNumFunctionEffects() FunctionEffect. // Present only when getNumFunctionEffects() > 0 // // * Optionally, an array of getNumFunctionEffects() EffectConditionExpr. // Present only when getNumFunctionEffectConditions() > 0. // - // * Optionally a Qualifiers object to represent extra qualifiers that can't - // be represented by FunctionTypeBitfields.FastTypeQuals. Present if and - // only if hasExtQualifiers() is true. - // // The optional FunctionTypeExtraBitfields has to be before the data // related to the exception specification since it contains the number // of exception types. // - // We put the ExtParameterInfos last. If all were equal, it would make + // We put the ExtParameterInfos later. If all were equal, it would make // more sense to put these before the exception specification, because // it's much easier to skip past them compared to the elaborate switch // required to skip the exception specification. However, all is not @@ -5134,6 +5132,10 @@ class FunctionProtoType final return hasExtParameterInfos() ? getNumParams() : 0; } + unsigned numTrailingObjects(OverloadToken) const { + return hasExtQualifiers() ? 
1 : 0; + } + unsigned numTrailingObjects(OverloadToken) const { return getNumFunctionEffects(); } @@ -8616,6 +8618,18 @@ QualType DecayedType::getPointeeType() const { void FixedPointValueToString(SmallVectorImpl &Str, llvm::APSInt Val, unsigned Scale); +inline FunctionEffectsRef FunctionEffectsRef::get(QualType QT) { + while (true) { + QualType Pointee = QT->getPointeeType(); + if (Pointee.isNull()) + break; + QT = Pointee; + } + if (const auto *FPT = QT->getAs()) + return FPT->getFunctionEffects(); + return {}; +} + } // namespace clang #endif // LLVM_CLANG_AST_TYPE_H diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index f4aa1387974aa..a8e599f7ebe04 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4896,14 +4896,14 @@ QualType ASTContext::getFunctionTypeInternal( size_t Size = FunctionProtoType::totalSizeToAlloc< QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields, FunctionType::FunctionTypeArmAttributes, FunctionType::ExceptionType, - Expr *, FunctionDecl *, FunctionProtoType::ExtParameterInfo, - FunctionEffect, EffectConditionExpr, Qualifiers>( + Expr *, FunctionDecl *, FunctionProtoType::ExtParameterInfo, Qualifiers, + FunctionEffect, EffectConditionExpr>( NumArgs, EPI.Variadic, EPI.requiresFunctionProtoTypeExtraBitfields(), EPI.requiresFunctionProtoTypeArmAttributes(), ESH.NumExceptionType, ESH.NumExprPtr, ESH.NumFunctionDeclPtr, - EPI.ExtParameterInfos ? NumArgs : 0, EPI.FunctionEffects.size(), - EPI.FunctionEffects.conditions().size(), - EPI.TypeQuals.hasNonFastQualifiers() ? 1 : 0); + EPI.ExtParameterInfos ? NumArgs : 0, + EPI.TypeQuals.hasNonFastQualifiers() ? 1 : 0, EPI.FunctionEffects.size(), + EPI.FunctionEffects.conditions().size()); auto *FTP = (FunctionProtoType *)Allocate(Size, alignof(FunctionProtoType)); FunctionProtoType::ExtProtoInfo newEPI = EPI; @@ -4911,6 +4911,8 @@ QualType ASTContext::getFunctionTypeInternal( Types.push_back(FTP); if (!Unique) FunctionProtoTypes.InsertNode(FTP, InsertPos); + if (!EPI.FunctionEffects.empty()) + AnyFunctionEffects = true; return QualType(FTP, 0); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 5bf1f3dbdbd4b..fdaab8e434593 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3798,9 +3798,18 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result, } epi.ExtInfo.Profile(ID); - ID.AddInteger((epi.AArch64SMEAttributes << 1) | epi.HasTrailingReturn); - epi.FunctionEffects.Profile(ID); + unsigned EffectCount = epi.FunctionEffects.size(); + bool HasConds = !epi.FunctionEffects.Conditions.empty(); + + ID.AddInteger((EffectCount << 3) | (HasConds << 2) | + (epi.AArch64SMEAttributes << 1) | epi.HasTrailingReturn); + + for (unsigned Idx = 0; Idx != EffectCount; ++Idx) { + ID.AddInteger(epi.FunctionEffects.Effects[Idx].toOpaqueInt32()); + if (HasConds) + ID.AddPointer(epi.FunctionEffects.Conditions[Idx].getCondition()); + } } void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, @@ -5181,17 +5190,6 @@ bool FunctionEffect::shouldDiagnoseFunctionCall( // ===== -void FunctionEffectsRef::Profile(llvm::FoldingSetNodeID &ID) const { - bool HasConds = !Conditions.empty(); - - ID.AddInteger(size() | (HasConds << 31u)); - for (unsigned Idx = 0, Count = Effects.size(); Idx != Count; ++Idx) { - ID.AddInteger(Effects[Idx].toOpaqueInt32()); - if (HasConds) - ID.AddPointer(Conditions[Idx].getCondition()); - } -} - bool FunctionEffectSet::insert(const FunctionEffectWithCondition &NewEC, Conflicts &Errs) { FunctionEffect::Kind 
NewOppositeKind = NewEC.Effect.oppositeKind(); @@ -5313,18 +5311,6 @@ LLVM_DUMP_METHOD void FunctionEffectSet::dump(llvm::raw_ostream &OS) const { FunctionEffectsRef(*this).dump(OS); } -FunctionEffectsRef FunctionEffectsRef::get(QualType QT) { - while (true) { - QualType Pointee = QT->getPointeeType(); - if (Pointee.isNull()) - break; - QT = Pointee; - } - if (const auto *FPT = QT->getAs()) - return FPT->getFunctionEffects(); - return {}; -} - FunctionEffectsRef FunctionEffectsRef::create(ArrayRef FX, ArrayRef Conds) { diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index d6228718d53ae..46417964f0896 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -718,8 +718,8 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty, diagnoseNullableToNonnullConversion(Ty, E->getType(), E->getBeginLoc()); diagnoseZeroToNullptrConversion(Kind, E); - if (!isCast(CCK) && Kind != CK_NullToPointer && - Kind != CK_NullToMemberPointer) + if (Context.hasAnyFunctionEffects() && !isCast(CCK) && + Kind != CK_NullToPointer && Kind != CK_NullToMemberPointer) diagnoseFunctionEffectConversion(Ty, E->getType(), E->getBeginLoc()); QualType ExprTy = Context.getCanonicalType(E->getType()); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 1f2fde12c9d24..a3dd5ede9116a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3813,45 +3813,47 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S, return true; } - const auto OldFX = Old->getFunctionEffects(); - const auto NewFX = New->getFunctionEffects(); QualType OldQTypeForComparison = OldQType; - if (OldFX != NewFX) { - const auto Diffs = FunctionEffectDifferences(OldFX, NewFX); - for (const auto &Diff : Diffs) { - if (Diff.shouldDiagnoseRedeclaration(*Old, OldFX, *New, NewFX)) { - Diag(New->getLocation(), - diag::warn_mismatched_func_effect_redeclaration) - << Diff.effectName(); - Diag(Old->getLocation(), diag::note_previous_declaration); + if (Context.hasAnyFunctionEffects()) { + const auto OldFX = Old->getFunctionEffects(); + const auto NewFX = New->getFunctionEffects(); + if (OldFX != NewFX) { + const auto Diffs = FunctionEffectDifferences(OldFX, NewFX); + for (const auto &Diff : Diffs) { + if (Diff.shouldDiagnoseRedeclaration(*Old, OldFX, *New, NewFX)) { + Diag(New->getLocation(), + diag::warn_mismatched_func_effect_redeclaration) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_previous_declaration); + } } - } - // Following a warning, we could skip merging effects from the previous - // declaration, but that would trigger an additional "conflicting types" - // error. - if (const auto *NewFPT = NewQType->getAs()) { - FunctionEffectSet::Conflicts MergeErrs; - FunctionEffectSet MergedFX = - FunctionEffectSet::getUnion(OldFX, NewFX, MergeErrs); - if (!MergeErrs.empty()) - diagnoseFunctionEffectMergeConflicts(MergeErrs, New->getLocation(), - Old->getLocation()); - - FunctionProtoType::ExtProtoInfo EPI = NewFPT->getExtProtoInfo(); - EPI.FunctionEffects = FunctionEffectsRef(MergedFX); - QualType ModQT = Context.getFunctionType(NewFPT->getReturnType(), - NewFPT->getParamTypes(), EPI); - - New->setType(ModQT); - NewQType = New->getType(); - - // Revise OldQTForComparison to include the merged effects, - // so as not to fail due to differences later. 
- if (const auto *OldFPT = OldQType->getAs()) { - EPI = OldFPT->getExtProtoInfo(); + // Following a warning, we could skip merging effects from the previous + // declaration, but that would trigger an additional "conflicting types" + // error. + if (const auto *NewFPT = NewQType->getAs()) { + FunctionEffectSet::Conflicts MergeErrs; + FunctionEffectSet MergedFX = + FunctionEffectSet::getUnion(OldFX, NewFX, MergeErrs); + if (!MergeErrs.empty()) + diagnoseFunctionEffectMergeConflicts(MergeErrs, New->getLocation(), + Old->getLocation()); + + FunctionProtoType::ExtProtoInfo EPI = NewFPT->getExtProtoInfo(); EPI.FunctionEffects = FunctionEffectsRef(MergedFX); - OldQTypeForComparison = Context.getFunctionType( - OldFPT->getReturnType(), OldFPT->getParamTypes(), EPI); + QualType ModQT = Context.getFunctionType(NewFPT->getReturnType(), + NewFPT->getParamTypes(), EPI); + + New->setType(ModQT); + NewQType = New->getType(); + + // Revise OldQTForComparison to include the merged effects, + // so as not to fail due to differences later. + if (const auto *OldFPT = OldQType->getAs()) { + EPI = OldFPT->getExtProtoInfo(); + EPI.FunctionEffects = FunctionEffectsRef(MergedFX); + OldQTypeForComparison = Context.getFunctionType( + OldFPT->getReturnType(), OldFPT->getParamTypes(), EPI); + } } } } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 2bfb103e8953d..04b8d88cae217 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -18100,38 +18100,40 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, } // Virtual overrides: check for matching effects. - const auto OldFX = Old->getFunctionEffects(); - const auto NewFXOrig = New->getFunctionEffects(); - - if (OldFX != NewFXOrig) { - FunctionEffectSet NewFX(NewFXOrig); - const auto Diffs = FunctionEffectDifferences(OldFX, NewFX); - FunctionEffectSet::Conflicts Errs; - for (const auto &Diff : Diffs) { - switch (Diff.shouldDiagnoseMethodOverride(*Old, OldFX, *New, NewFX)) { - case FunctionEffectDiff::OverrideResult::NoAction: - break; - case FunctionEffectDiff::OverrideResult::Warn: - Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) - << Diff.effectName(); - Diag(Old->getLocation(), diag::note_overridden_virtual_function) - << Old->getReturnTypeSourceRange(); - break; - case FunctionEffectDiff::OverrideResult::Merge: { - NewFX.insert(Diff.Old, Errs); - const auto *NewFT = New->getType()->castAs(); - FunctionProtoType::ExtProtoInfo EPI = NewFT->getExtProtoInfo(); - EPI.FunctionEffects = FunctionEffectsRef(NewFX); - QualType ModQT = Context.getFunctionType(NewFT->getReturnType(), - NewFT->getParamTypes(), EPI); - New->setType(ModQT); - break; - } + if (Context.hasAnyFunctionEffects()) { + const auto OldFX = Old->getFunctionEffects(); + const auto NewFXOrig = New->getFunctionEffects(); + + if (OldFX != NewFXOrig) { + FunctionEffectSet NewFX(NewFXOrig); + const auto Diffs = FunctionEffectDifferences(OldFX, NewFX); + FunctionEffectSet::Conflicts Errs; + for (const auto &Diff : Diffs) { + switch (Diff.shouldDiagnoseMethodOverride(*Old, OldFX, *New, NewFX)) { + case FunctionEffectDiff::OverrideResult::NoAction: + break; + case FunctionEffectDiff::OverrideResult::Warn: + Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function) + << Old->getReturnTypeSourceRange(); + break; + case FunctionEffectDiff::OverrideResult::Merge: { + NewFX.insert(Diff.Old, Errs); + const auto *NewFT = 
New->getType()->castAs(); + FunctionProtoType::ExtProtoInfo EPI = NewFT->getExtProtoInfo(); + EPI.FunctionEffects = FunctionEffectsRef(NewFX); + QualType ModQT = Context.getFunctionType(NewFT->getReturnType(), + NewFT->getParamTypes(), EPI); + New->setType(ModQT); + break; + } + } } + if (!Errs.empty()) + diagnoseFunctionEffectMergeConflicts(Errs, New->getLocation(), + Old->getLocation()); } - if (!Errs.empty()) - diagnoseFunctionEffectMergeConflicts(Errs, New->getLocation(), - Old->getLocation()); } CallingConv NewCC = NewFT->getCallConv(), OldCC = OldFT->getCallConv(); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 074062ebbb594..472e7ae5d1d3f 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1863,7 +1863,7 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType, // we need to not alter FromFn, or else even an innocuous cast // like dropping effects will fail. In C++ however we do want to // alter FromFn (because of the way PerformImplicitConversion works). - if (getLangOpts().CPlusPlus) { + if (Context.hasAnyFunctionEffects() && getLangOpts().CPlusPlus) { FromFPT = cast(FromFn); // in case FromFn changed above // Transparently add/drop effects; here we are concerned with From 0778f5c1f11da599b71d6c9f5990fd880ff7cb46 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 17 Jul 2024 10:45:59 -0700 Subject: [PATCH 313/777] [ELF] Support NOCROSSREFS and NOCROSSERFS_TO Implement the two commands described by https://sourceware.org/binutils/docs/ld/Miscellaneous-Commands.html After `outputSections` is available, check each output section described by at least one `NOCROSSREFS`/`NOCROSSERFS_TO` command. For each checked output section, scan relocations from its input sections. This step is slow, therefore utilize `parallelForEach(isd->sections, ...)`. To support non SHF_ALLOC sections, `InputSectionBase::relocations` (empty) cannot be used. In addition, we may explore eliminating this member to speed up relocation scanning. Some parse code is adapted from #95714. Close #41825 Pull Request: https://github.com/llvm/llvm-project/pull/98773 --- lld/ELF/LinkerScript.h | 13 +++ lld/ELF/Relocations.cpp | 58 +++++++++++++ lld/ELF/Relocations.h | 1 + lld/ELF/ScriptParser.cpp | 16 ++++ lld/ELF/Writer.cpp | 5 ++ lld/docs/ReleaseNotes.rst | 3 + lld/test/ELF/linkerscript/nocrossrefs.test | 99 ++++++++++++++++++++++ 7 files changed, 195 insertions(+) create mode 100644 lld/test/ELF/linkerscript/nocrossrefs.test diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index 43d0850eed718..b86521a429f04 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -256,6 +256,16 @@ struct InsertCommand { StringRef where; }; +// A NOCROSSREFS/NOCROSSREFS_TO command that prohibits references between +// certain output sections. +struct NoCrossRefCommand { + SmallVector outputSections; + + // When true, this describes a NOCROSSREFS_TO command that probits references + // to the first output section from any of the other sections. + bool toFirst = false; +}; + struct PhdrsCommand { StringRef name; unsigned type = llvm::ELF::PT_NULL; @@ -397,6 +407,9 @@ class LinkerScript final { // OutputSections specified by OVERWRITE_SECTIONS. SmallVector overwriteSections; + // NOCROSSREFS(_TO) commands. + SmallVector noCrossRefs; + // Sections that will be warned/errored by --orphan-handling. 
SmallVector orphanSections; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 9ad180306bcd8..36857d72c647e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -2367,7 +2367,65 @@ void elf::hexagonTLSSymbolUpdate(ArrayRef outputSections) { }); } +static bool matchesRefTo(const NoCrossRefCommand &cmd, StringRef osec) { + if (cmd.toFirst) + return cmd.outputSections[0] == osec; + return llvm::is_contained(cmd.outputSections, osec); +} + +template +static void scanCrossRefs(const NoCrossRefCommand &cmd, OutputSection *osec, + InputSection *sec, Rels rels) { + for (const auto &r : rels) { + Symbol &sym = sec->file->getSymbol(r.getSymbol(config->isMips64EL)); + // A legal cross-reference is when the destination output section is + // nullptr, osec for a self-reference, or a section that is described by the + // NOCROSSREFS/NOCROSSREFS_TO command. + auto *dstOsec = sym.getOutputSection(); + if (!dstOsec || dstOsec == osec || !matchesRefTo(cmd, dstOsec->name)) + continue; + + std::string toSymName; + if (!sym.isSection()) + toSymName = toString(sym); + else if (auto *d = dyn_cast(&sym)) + toSymName = d->section->name; + errorOrWarn(sec->getLocation(r.r_offset) + + ": prohibited cross reference from '" + osec->name + "' to '" + + toSymName + "' in '" + dstOsec->name + "'"); + } +} + +// For each output section described by at least one NOCROSSREFS(_TO) command, +// scan relocations from its input sections for prohibited cross references. +template void elf::checkNoCrossRefs() { + for (OutputSection *osec : outputSections) { + for (const NoCrossRefCommand &noxref : script->noCrossRefs) { + if (!llvm::is_contained(noxref.outputSections, osec->name) || + (noxref.toFirst && noxref.outputSections[0] == osec->name)) + continue; + for (SectionCommand *cmd : osec->commands) { + auto *isd = dyn_cast(cmd); + if (!isd) + continue; + parallelForEach(isd->sections, [&](InputSection *sec) { + const RelsOrRelas rels = sec->template relsOrRelas(); + if (rels.areRelocsRel()) + scanCrossRefs(noxref, osec, sec, rels.rels); + else + scanCrossRefs(noxref, osec, sec, rels.relas); + }); + } + } + } +} + template void elf::scanRelocations(); template void elf::scanRelocations(); template void elf::scanRelocations(); template void elf::scanRelocations(); + +template void elf::checkNoCrossRefs(); +template void elf::checkNoCrossRefs(); +template void elf::checkNoCrossRefs(); +template void elf::checkNoCrossRefs(); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index e299d23dd7db3..1bee0dedf8587 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -141,6 +141,7 @@ struct JumpInstrMod { // Call reportUndefinedSymbols() after calling scanRelocations() to emit // the diagnostics. 
template void scanRelocations(); +template void checkNoCrossRefs(); void reportUndefinedSymbols(); void postScanRelocations(); void addGotEntry(Symbol &sym); diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 92ef9330141fc..47a94c29ea496 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -87,6 +87,7 @@ class ScriptParser final : ScriptLexer { void readTarget(); void readVersion(); void readVersionScriptCommand(); + void readNoCrossRefs(bool to); SymbolAssignment *readSymbolAssignment(StringRef name); ByteCommand *readByteCommand(StringRef tok); @@ -280,6 +281,10 @@ void ScriptParser::readLinkerScript() { readTarget(); } else if (tok == "VERSION") { readVersion(); + } else if (tok == "NOCROSSREFS") { + readNoCrossRefs(/*to=*/false); + } else if (tok == "NOCROSSREFS_TO") { + readNoCrossRefs(/*to=*/true); } else if (SymbolAssignment *cmd = readAssignment(tok)) { script->sectionCommands.push_back(cmd); } else { @@ -299,6 +304,17 @@ void ScriptParser::readDefsym(StringRef name) { script->sectionCommands.push_back(cmd); } +void ScriptParser::readNoCrossRefs(bool to) { + expect("("); + NoCrossRefCommand cmd{{}, to}; + while (!errorCount() && !consume(")")) + cmd.outputSections.push_back(unquote(next())); + if (cmd.outputSections.size() < 2) + warn(getCurrentLocation() + ": ignored with fewer than 2 output sections"); + else + script->noCrossRefs.push_back(std::move(cmd)); +} + void ScriptParser::addFile(StringRef s) { if (isUnderSysroot && s.starts_with("/")) { SmallString<128> pathData; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 8940a1c5d5113..5cffdb771a738 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1943,6 +1943,11 @@ template void Writer::finalizeSections() { // have the headers, we can find out which sections they point to. setReservedSymbolSections(); + if (script->noCrossRefs.size()) { + llvm::TimeTraceScope timeScope("Check NOCROSSREFS"); + checkNoCrossRefs(); + } + { llvm::TimeTraceScope timeScope("Finalize synthetic sections"); diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 05179bfdcb536..ba8543732bb8e 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -81,6 +81,9 @@ ELF Improvements (`#87530 `_) * ``OUTPUT_FORMAT(binary)`` is now supported. (`#98837 `_) +* ``NOCROSSREFS`` and ``NOCRFOSSREFS_TO`` commands now supported to prohibit + cross references between certain output sections. + (`#98773 `_) * Orphan placement is refined to prefer the last similar section when its rank <= orphan's rank. (`#94099 `_) Non-alloc orphan sections are now placed at the end. 
diff --git a/lld/test/ELF/linkerscript/nocrossrefs.test b/lld/test/ELF/linkerscript/nocrossrefs.test new file mode 100644 index 0000000000000..f13d50a03be87 --- /dev/null +++ b/lld/test/ELF/linkerscript/nocrossrefs.test @@ -0,0 +1,99 @@ +# REQUIRES: x86 +# RUN: rm -rf %t && split-file %s %t && cd %t + +# RUN: llvm-mc --triple=x86_64 -filetype=obj a.s -o a.o +# RUN: llvm-mc --triple=x86_64 -filetype=obj data.s -o data.o +# RUN: ld.lld a.o data.o -T 0.t 2>&1 | FileCheck %s --check-prefix=CHECK0 --implicit-check-not=warning: + +# CHECK0: warning: 0.t:3: ignored with fewer than 2 output sections +# CHECK0-NEXT: warning: 0.t:4: ignored with fewer than 2 output sections + +# RUN: not ld.lld a.o data.o -T 1.t 2>&1 | FileCheck %s --check-prefix=CHECK1 --implicit-check-not=error: +# CHECK1: error: a.o:(.text.start+0x11): prohibited cross reference from '.text' to 'data' in '.data' + +## .text and .text1 are in two NOCROSSREFS commands. Violations are reported twice. +# RUN: not ld.lld --threads=1 a.o data.o -T 2.t 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=error: +# CHECK2: error: a.o:(.text.start+0x6): prohibited cross reference from '.text' to '.text1' in '.text1' +# CHECK2-NEXT: error: a.o:(.text.start+0x6): prohibited cross reference from '.text' to '.text1' in '.text1' +# CHECK2-NEXT: error: a.o:(.text.start+0xb): prohibited cross reference from '.text' to 'foo2' in '.text2' +# CHECK2-NEXT: error: a.o:(.text.start+0x11): prohibited cross reference from '.text' to 'data' in '.data' +# CHECK2-NEXT: error: a.o:(.text.start+0x17): prohibited cross reference from '.text' to 'str1' in '.rodata' +## .data occurs twice in the command, but the violation is only reported once. +# CHECK2-NEXT: error: a.o:(.text1+0x1): prohibited cross reference from '.text1' to '_edata' in '.data' +# CHECK2-NEXT: error: a.o:(.nonalloc+0x0): prohibited cross reference from '.nonalloc' to '.text' in '.text' +# CHECK2-NEXT: error: a.o:(.nonalloc+0x10): prohibited cross reference from '.nonalloc' to 'data' in '.data' + +# RUN: not ld.lld a.o data.o -T 3.t 2>&1 | FileCheck %s --check-prefix=CHECK3 --implicit-check-not=error: +# CHECK3: error: a.o:(.nonalloc+0x0): prohibited cross reference from '.nonalloc' to '.text' in '.text' + +#--- 0.t +## Some cases that do not cause errors. 
+abs = 42; +NOCROSSREFS() +NOCROSSREFS (.text) +NOCROSSREFS( .text .text3 ); ## ; is ignored +NOCROSSREFS_TO(.text .text2 .text3 .data ); +NOCROSSREFS_TO (.data .text2 .text3) + +#--- 1.t +abs = 42; +NOCROSSREFS(.text ".data") + +#--- 2.t +abs = 42; +NOCROSSREFS(.text ".text1" .text ".text1" ) +NOCROSSREFS(.text .text1 .text2 .data .rodata .data .nonalloc) + +#--- 3.t +abs = 42; +NOCROSSREFS_TO(.text .text .text1 .text2 .data .nonalloc) + +#--- err1.t +NOCROSSREFS ) + +# RUN: not ld.lld a.o data.o -T err1.t 2>&1 | FileCheck %s --check-prefix=ERR1 --implicit-check-not=error: +# ERR1: error: err1.t:1: ( expected, but got ) + +#--- err2.t +NOCROSSREFS(.text + +# RUN: not ld.lld a.o data.o -T err2.t 2>&1 | FileCheck %s --check-prefix=ERR2 --implicit-check-not=error: +# ERR2: error: err2.t:1: unexpected EOF + +#--- a.s +.global _start, foo1, foo2, foo3 +.section .text.start,"ax" +_start: + call _start + call .text1 + call foo2 + movl data(%rip), %eax + movl str1(%rip), %eax + +.section .text1,"ax" +foo1: + call _edata + +.section .text2,"ax" +foo2: + call foo3 + +.section .text3,"ax" +foo3: + call foo2 + +.section .rodata.str1.1,"aMS",@progbits,1 +str1: + .asciz "abc" + +.section .nonalloc,"" + .quad .text + .quad .text3 + .quad data + +#--- data.s +.section .data,"aw" +.globl data +data: + .byte 0 + .quad abs From 93d38d7f08864397f1e751c8cecde5ea302ecced Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Wed, 17 Jul 2024 14:49:22 -0300 Subject: [PATCH 314/777] [lldb][test] Fix simulator test for std::unique_ptr (#99357) libcxx-simulators/unique_ptr/main.cpp uses __builtin_printf, that maps to printf on Windows. Include stdio.h to avoid linker errors on Windows. See https://lab.llvm.org/buildbot/#/builders/141/builds/853 --- .../data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp index 08324e24f9cc4..a6bfaa3febebb 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp @@ -1,5 +1,7 @@ #include +#include + namespace std { namespace __lldb { template struct default_delete { From 858147d0b88b50f6829834a059d95924ea8e5d4d Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 17 Jul 2024 10:54:08 -0700 Subject: [PATCH 315/777] [CMake][Fuchsia] Include new/delete in baremetal targets (#99279) These don't include libcxxabi yet so we need new/delete in libcxx. 
--- clang/cmake/caches/Fuchsia-stage2.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index b4561e6c87ba5..01c47e5b0be44 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -338,6 +338,7 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi) set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") @@ -388,6 +389,7 @@ foreach(target riscv32-unknown-elf) set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") From e3b8d3649789a59e54a32998780fb64d0663284c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 17 Jul 2024 11:01:20 -0700 Subject: [PATCH 316/777] [ARC,CSKY] Update getMemcpy after #98969 --- llvm/lib/Target/ARC/ARCISelLowering.cpp | 2 +- llvm/lib/Target/CSKY/CSKYISelLowering.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp index 5dd343d97b80c..5ab27681361db 100644 --- a/llvm/lib/Target/ARC/ARCISelLowering.cpp +++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp @@ -608,7 +608,7 @@ SDValue ARCTargetLowering::LowerCallArguments( InVals.push_back(FIN); MemOps.push_back(DAG.getMemcpy( Chain, dl, FIN, ArgDI.SDV, DAG.getConstant(Size, dl, MVT::i32), - Alignment, false, false, false, MachinePointerInfo(), + Alignment, false, false, /*CI=*/nullptr, false, MachinePointerInfo(), MachinePointerInfo())); } else { InVals.push_back(ArgDI.SDV); diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index 869277a391a56..c3fc9f9ead5eb 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -556,7 +556,7 @@ SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, /*IsVolatile=*/false, - /*AlwaysInline=*/false, IsTailCall, + /*AlwaysInline=*/false, /*CI=*/nullptr, IsTailCall, MachinePointerInfo(), MachinePointerInfo()); ByValArgs.push_back(FIPtr); } From 321a0c00425adeab84bce657cac85ae4634df910 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Wed, 17 Jul 2024 14:05:51 -0400 Subject: [PATCH 317/777] The pragma STDC CX_LIMITED_RANGE ON should have precedence. (#98520) The `pragma STDC CX_LIMITED_RANGE` should have precedence over the command line `-fcomplex-arithmetic`. 
--- clang/include/clang/AST/Expr.h | 19 +++++++++ clang/include/clang/AST/Stmt.h | 5 +++ clang/lib/CodeGen/CGExprComplex.cpp | 44 +++++++++++++------- clang/test/CodeGen/pragma-cx-limited-range.c | 28 ++++++------- 4 files changed, 66 insertions(+), 30 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index a8add9d1337c6..5b813bfc2faf9 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -2333,6 +2333,11 @@ class UnaryOperator final return getTrailingFPFeatures(); } + /// Get the store FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride(); + } + protected: /// Set FPFeatures in trailing storage, used by Serialization & ASTImporter. void setStoredFPFeatures(FPOptionsOverride F) { getTrailingFPFeatures() = F; } @@ -3096,6 +3101,11 @@ class CallExpr : public Expr { *getTrailingFPFeatures() = F; } + /// Get the store FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride(); + } + /// Get the FP features status of this operator. Only meaningful for /// operations on floating point types. FPOptions getFPFeaturesInEffect(const LangOptions &LO) const { @@ -3592,6 +3602,11 @@ class CastExpr : public Expr { return *getTrailingFPFeatures(); } + /// Get the store FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride(); + } + /// Get the FP features status of this operation. Only meaningful for /// operations on floating point types. FPOptions getFPFeaturesInEffect(const LangOptions &LO) const { @@ -4038,6 +4053,10 @@ class BinaryOperator : public Expr { assert(BinaryOperatorBits.HasFPFeatures); *getTrailingFPFeatures() = F; } + /// Get the store FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride(); + } /// Get the FP features status of this operator. Only meaningful for /// operations on floating point types. diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index 9cd7a364cd3f1..e91e89d728ca0 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -1658,6 +1658,11 @@ class CompoundStmt final return *getTrailingObjects(); } + /// Get the store FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? 
getStoredFPFeatures() : FPOptionsOverride(); + } + using body_iterator = Stmt **; using body_range = llvm::iterator_range; diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index 000d4ff5c0698..4d45f6d64c1cd 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -328,12 +328,20 @@ class ComplexExprEmitter } } - QualType getPromotionType(QualType Ty, bool IsDivOpCode = false) { + QualType getPromotionType(FPOptionsOverride Features, QualType Ty, + bool IsDivOpCode = false) { if (auto *CT = Ty->getAs()) { QualType ElementType = CT->getElementType(); - if (IsDivOpCode && ElementType->isFloatingType() && - CGF.getLangOpts().getComplexRange() == - LangOptions::ComplexRangeKind::CX_Promoted) + bool IsFloatingType = ElementType->isFloatingType(); + bool IsComplexRangePromoted = CGF.getLangOpts().getComplexRange() == + LangOptions::ComplexRangeKind::CX_Promoted; + bool HasNoComplexRangeOverride = !Features.hasComplexRangeOverride(); + bool HasMatchingComplexRange = Features.hasComplexRangeOverride() && + Features.getComplexRangeOverride() == + CGF.getLangOpts().getComplexRange(); + + if (IsDivOpCode && IsFloatingType && IsComplexRangePromoted && + (HasNoComplexRangeOverride || HasMatchingComplexRange)) return HigherPrecisionTypeForComplexArithmetic(ElementType, IsDivOpCode); if (ElementType.UseExcessPrecision(CGF.getContext())) @@ -347,7 +355,7 @@ class ComplexExprEmitter #define HANDLEBINOP(OP) \ ComplexPairTy VisitBin##OP(const BinaryOperator *E) { \ QualType promotionTy = getPromotionType( \ - E->getType(), \ + E->getStoredFPFeaturesOrDefault(), E->getType(), \ (E->getOpcode() == BinaryOperatorKind::BO_Div) ? true : false); \ ComplexPairTy result = EmitBin##OP(EmitBinOps(E, promotionTy)); \ if (!promotionTy.isNull()) \ @@ -641,9 +649,12 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op, ComplexPairTy ComplexExprEmitter::VisitUnaryPlus(const UnaryOperator *E, QualType PromotionType) { - QualType promotionTy = PromotionType.isNull() - ? getPromotionType(E->getSubExpr()->getType()) - : PromotionType; + E->hasStoredFPFeatures(); + QualType promotionTy = + PromotionType.isNull() + ? getPromotionType(E->getStoredFPFeaturesOrDefault(), + E->getSubExpr()->getType()) + : PromotionType; ComplexPairTy result = VisitPlus(E, promotionTy); if (!promotionTy.isNull()) return CGF.EmitUnPromotedValue(result, E->getSubExpr()->getType()); @@ -661,9 +672,11 @@ ComplexPairTy ComplexExprEmitter::VisitPlus(const UnaryOperator *E, ComplexPairTy ComplexExprEmitter::VisitUnaryMinus(const UnaryOperator *E, QualType PromotionType) { - QualType promotionTy = PromotionType.isNull() - ? getPromotionType(E->getSubExpr()->getType()) - : PromotionType; + QualType promotionTy = + PromotionType.isNull() + ? getPromotionType(E->getStoredFPFeaturesOrDefault(), + E->getSubExpr()->getType()) + : PromotionType; ComplexPairTy result = VisitMinus(E, promotionTy); if (!promotionTy.isNull()) return CGF.EmitUnPromotedValue(result, E->getSubExpr()->getType()); @@ -1218,13 +1231,15 @@ EmitCompoundAssignLValue(const CompoundAssignOperator *E, // __block variables need to have the rhs evaluated first, plus this should // improve codegen a little. 
QualType PromotionTypeCR; - PromotionTypeCR = getPromotionType(E->getComputationResultType()); + PromotionTypeCR = getPromotionType(E->getStoredFPFeaturesOrDefault(), + E->getComputationResultType()); if (PromotionTypeCR.isNull()) PromotionTypeCR = E->getComputationResultType(); OpInfo.Ty = PromotionTypeCR; QualType ComplexElementTy = OpInfo.Ty->castAs()->getElementType(); - QualType PromotionTypeRHS = getPromotionType(E->getRHS()->getType()); + QualType PromotionTypeRHS = getPromotionType( + E->getStoredFPFeaturesOrDefault(), E->getRHS()->getType()); // The RHS should have been converted to the computation type. if (E->getRHS()->getType()->isRealFloatingType()) { @@ -1252,7 +1267,8 @@ EmitCompoundAssignLValue(const CompoundAssignOperator *E, // Load from the l-value and convert it. SourceLocation Loc = E->getExprLoc(); - QualType PromotionTypeLHS = getPromotionType(E->getComputationLHSType()); + QualType PromotionTypeLHS = getPromotionType( + E->getStoredFPFeaturesOrDefault(), E->getComputationLHSType()); if (LHSTy->isAnyComplexType()) { ComplexPairTy LHSVal = EmitLoadOfLValue(LHS, Loc); if (!PromotionTypeLHS.isNull()) diff --git a/clang/test/CodeGen/pragma-cx-limited-range.c b/clang/test/CodeGen/pragma-cx-limited-range.c index 68615348c1871..1c9bf40fd714f 100644 --- a/clang/test/CodeGen/pragma-cx-limited-range.c +++ b/clang/test/CodeGen/pragma-cx-limited-range.c @@ -106,21 +106,17 @@ _Complex float pragma_on_div(_Complex float a, _Complex float b) { // IMPRVD-NEXT: fdiv float // IMPRVD-NEXT: fdiv float - // PRMTD: fpext float {{.*}} to double - // PRMTD: fpext float {{.*}} to double - // PRMTD: fmul double - // PRMTD: fmul double - // PRMTD: fadd double - // PRMTD: fmul double - // PRMTD: fmul double - // PRMTD: fadd double - // PRMTD: fmul double - // PRMTD: fmul double - // PRMTD: fsub double - // PRMTD: fdiv double - // PRMTD: fdiv double - // PRMTD: fptrunc double - // PRMTD: fptrunc double + // PRMTD: fmul float + // PRMTD-NEXT: fmul float + // PRMTD-NEXT: fadd float + // PRMTD-NEXT: fmul float + // PRMTD-NEXT: fmul float + // PRMTD-NEXT: fadd float + // PRMTD-NEXT: fmul float + // PRMTD-NEXT: fmul float + // PRMTD-NEXT: fsub float + // PRMTD-NEXT: fdiv float + // PRMTD-NEXT: fdiv float return a / b; } @@ -135,7 +131,7 @@ _Complex float pragma_off_div(_Complex float a, _Complex float b) { // IMPRVD: call {{.*}} @__divsc3 - // PRMTD: call {{.*}} @__divdc3 + // PRMTD: call {{.*}} @__divsc3 return a / b; } From ddbf5ea6d48d3fbf5300309ca009f9e4e67fb58a Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 17 Jul 2024 11:06:18 -0700 Subject: [PATCH 318/777] [SandboxIR][NFC] Add some comments (#99359) --- llvm/include/llvm/SandboxIR/SandboxIR.h | 35 ++++++++++++++++++------- llvm/lib/SandboxIR/SandboxIR.cpp | 13 +++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index fcb581211736e..473bd93aea7c1 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -76,7 +76,8 @@ class Instruction; class User; class Value; -/// Returns the operand edge when dereferenced. +/// Iterator for the `Use` edges of a User's operands. +/// \Returns the operand `Use` when dereferenced. class OperandUseIterator { sandboxir::Use Use; /// Don't let the user create a non-empty OperandUseIterator. @@ -103,7 +104,8 @@ class OperandUseIterator { } }; -/// Returns user edge when dereferenced. +/// Iterator for the `Use` edges of a Value's users. 
+/// \Returns a `Use` when dereferenced. class UserUseIterator { sandboxir::Use Use; /// Don't let the user create a non-empty UserUseIterator. @@ -162,7 +164,9 @@ class Value { unsigned UID; #endif /// The LLVM Value that corresponds to this SandboxIR Value. - /// NOTE: Some SBInstructions, like Packs, may include more than one value. + /// NOTE: Some sandboxir Instructions, like Packs, may include more than one + /// value and in these cases `Val` points to the last instruction in program + /// order. llvm::Value *Val = nullptr; friend class Context; // For getting `Val`. @@ -300,6 +304,7 @@ class Argument : public sandboxir::Value { #endif }; +/// A sandboxir::User has operands. class User : public Value { protected: User(ClassID ID, llvm::Value *V, Context &Ctx) : Value(ID, V, Ctx) {} @@ -309,6 +314,9 @@ class User : public Value { /// match the underlying LLVM instruction. All others should use a different /// implementation. Use getOperandUseDefault(unsigned OpIdx, bool Verify) const; + /// \Returns the Use for the \p OpIdx'th operand. This is virtual to allow + /// instructions to deviate from the LLVM IR operands, which is a requirement + /// for sandboxir Instructions that consist of more than one LLVM Instruction. virtual Use getOperandUseInternal(unsigned OpIdx, bool Verify) const = 0; friend class OperandUseIterator; // for getOperandUseInternal() @@ -414,7 +422,8 @@ class Constant : public sandboxir::User { #endif }; -/// The BasicBlock::iterator. +/// Iterator for `Instruction`s in a `BasicBlock. +/// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { public: using difference_type = std::ptrdiff_t; @@ -456,7 +465,8 @@ class BBIterator { pointer get() const { return getInstr(It); } }; -/// A sandboxir::User with operands and opcode. +/// A sandboxir::User with operands, opcode and linked with previous/next +/// instructions in an instruction list. class Instruction : public sandboxir::User { public: enum class Opcode { @@ -577,6 +587,7 @@ class OpaqueInst : public sandboxir::Instruction { #endif }; +/// Contains a list of sandboxir::Instruction's. class BasicBlock : public Value { /// Builds a graph that contains all values in \p BB in their original form /// i.e., no vectorization is taking place here. @@ -643,9 +654,10 @@ class Context { friend void Instruction::eraseFromParent(); // For detach(). /// Take ownership of VPtr and store it in `LLVMValueToValueMap`. Value *registerValue(std::unique_ptr &&VPtr); - + /// This is the actual function that creates sandboxir values for \p V, + /// and among others handles all instruction types. Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr); - + /// Get or create a sandboxir::Argument for an existing LLVM IR \p LLVMArg. Argument *getOrCreateArgument(llvm::Argument *LLVMArg) { auto Pair = LLVMValueToValueMap.insert({LLVMArg, nullptr}); auto It = Pair.first; @@ -655,11 +667,12 @@ class Context { } return cast(It->second.get()); } - + /// Get or create a sandboxir::Value for an existing LLVM IR \p LLVMV. Value *getOrCreateValue(llvm::Value *LLVMV) { return getOrCreateValueInternal(LLVMV, 0); } - + /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will + /// also create all contents of the block. BasicBlock *createBasicBlock(llvm::BasicBlock *BB); friend class BasicBlock; // For getOrCreateValue(). 
@@ -671,7 +684,9 @@ class Context { const sandboxir::Value *getValue(const llvm::Value *V) const { return getValue(const_cast(V)); } - + /// Create a sandboxir::Function for an existing LLVM IR \p F, including all + /// blocks and instructions. + /// This is the main API function for creating Sandbox IR. Function *createFunction(llvm::Function *F); /// \Returns the number of values registered with Context. diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index a3f350e9ca8b0..2984c6eaccd64 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -60,6 +60,8 @@ OperandUseIterator &OperandUseIterator::operator++() { } UserUseIterator &UserUseIterator::operator++() { + // Get the corresponding llvm::Use, get the next in the list, and update the + // sandboxir::Use. llvm::Use *&LLVMUse = Use.LLVMUse; assert(LLVMUse != nullptr && "Already at end!"); LLVMUse = LLVMUse->getNext(); @@ -107,6 +109,7 @@ void Value::replaceUsesWithIf( Value *OtherV, llvm::function_ref ShouldReplace) { assert(getType() == OtherV->getType() && "Can't replace with different type"); llvm::Value *OtherVal = OtherV->Val; + // We are delegating RUWIf to LLVM IR's RUWIf. Val->replaceUsesWithIf( OtherVal, [&ShouldReplace, this](llvm::Use &LLVMUse) -> bool { User *DstU = cast_or_null(Ctx.getValue(LLVMUse.getUser())); @@ -119,6 +122,7 @@ void Value::replaceUsesWithIf( void Value::replaceAllUsesWith(Value *Other) { assert(getType() == Other->getType() && "Replacing with Value of different type!"); + // We are delegating RAUW to LLVM IR's RAUW. Val->replaceAllUsesWith(Other->Val); } @@ -208,10 +212,12 @@ bool User::classof(const Value *From) { void User::setOperand(unsigned OperandIdx, Value *Operand) { assert(isa(Val) && "No operands!"); + // We are delegating to llvm::User::setOperand(). cast(Val)->setOperand(OperandIdx, Operand->Val); } bool User::replaceUsesOfWith(Value *FromV, Value *ToV) { + // We are delegating RUOW to LLVM IR's RUOW. return cast(Val)->replaceUsesOfWith(FromV->Val, ToV->Val); } @@ -282,6 +288,9 @@ BBIterator Instruction::getIterator() const { Instruction *Instruction::getNextNode() const { assert(getParent() != nullptr && "Detached!"); assert(getIterator() != getParent()->end() && "Already at end!"); + // `Val` is the bottom-most LLVM IR instruction. Get the next in the chain, + // and get the corresponding sandboxir Instruction that maps to it. This works + // even for SandboxIR Instructions that map to more than one LLVM Instruction. auto *LLVMI = cast(Val); assert(LLVMI->getParent() != nullptr && "LLVM IR instr is detached!"); auto *NextLLVMI = LLVMI->getNextNode(); @@ -342,6 +351,7 @@ void Instruction::insertBefore(Instruction *BeforeI) { assert(is_sorted(getLLVMInstrs(), [](auto *I1, auto *I2) { return I1->comesBefore(I2); }) && "Expected program order!"); + // Insert the LLVM IR Instructions in program order. for (llvm::Instruction *I : getLLVMInstrs()) I->insertBefore(BeforeTopI); } @@ -362,11 +372,14 @@ void Instruction::insertInto(BasicBlock *BB, const BBIterator &WhereIt) { LLVMBeforeI = nullptr; LLVMBeforeIt = LLVMBB->end(); } + // Insert the LLVM IR Instructions in program order. for (llvm::Instruction *I : getLLVMInstrs()) I->insertInto(LLVMBB, LLVMBeforeIt); } BasicBlock *Instruction::getParent() const { + // Get the LLVM IR Instruction that this maps to, get its parent, and get the + // corresponding sandboxir::BasicBlock by looking it up in sandboxir::Context. 
auto *BB = cast(Val)->getParent(); if (BB == nullptr) return nullptr; From 093f0a4770ec9bde9f7a21cfe9c5ec5b20a923a8 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 17 Jul 2024 11:06:39 -0700 Subject: [PATCH 319/777] [instcombine] Improve coverage for reductions of i1 types In advance of an upcoming change to generalize some of this to scalable vector types. --- .../InstCombine/vector-logical-reductions.ll | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll b/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll index da4a0ca754680..74f4ed01085f8 100644 --- a/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll +++ b/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll @@ -11,6 +11,15 @@ define i1 @reduction_logical_or(<4 x i1> %x) { ret i1 %r } +define i1 @reduction_logical_or_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_or_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.or.nxv2i1( %x) + ret i1 %r +} + define i1 @reduction_logical_and(<4 x i1> %x) { ; CHECK-LABEL: @reduction_logical_and( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4 @@ -21,6 +30,131 @@ define i1 @reduction_logical_and(<4 x i1> %x) { ret i1 %r } +define i1 @reduction_logical_and_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_and_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.and.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.and.nxv2i1( %x) + ret i1 %r +} + +define i1 @reduction_logical_mul(<2 x i1> %x) { +; CHECK-LABEL: @reduction_logical_mul( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2 +; CHECK-NEXT: [[R:%.*]] = icmp eq i2 [[TMP1]], -1 +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.mul.v4i1(<2 x i1> %x) + ret i1 %r +} + +define i1 @reduction_logical_mul_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_mul_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.mul.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.mul.nxv2i1( %x) + ret i1 %r +} + +define i1 @reduction_logical_xor(<2 x i1> %x) { +; CHECK-LABEL: @reduction_logical_xor( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2 +; CHECK-NEXT: [[TMP2:%.*]] = call range(i2 0, -1) i2 @llvm.ctpop.i2(i2 [[TMP1]]) +; CHECK-NEXT: [[R:%.*]] = trunc i2 [[TMP2]] to i1 +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.xor.v4i1(<2 x i1> %x) + ret i1 %r +} + +define i1 @reduction_logical_xor_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_xor_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.xor.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.xor.nxv2i1( %x) + ret i1 %r +} + +define i1 @reduction_logical_smin(<2 x i1> %x) { +; CHECK-LABEL: @reduction_logical_smin( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2 +; CHECK-NEXT: [[R:%.*]] = icmp ne i2 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.smin.v4i1(<2 x i1> %x) + ret i1 %r +} + +define i1 @reduction_logical_smin_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_smin_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.smin.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.smin.nxv2i1( %x) + ret i1 %r +} + +define i1 @reduction_logical_smax(<2 x i1> %x) { +; CHECK-LABEL: 
@reduction_logical_smax( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2 +; CHECK-NEXT: [[R:%.*]] = icmp eq i2 [[TMP1]], -1 +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.smax.v4i1(<2 x i1> %x) + ret i1 %r +} + +define i1 @reduction_logical_smax_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_smax_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.smax.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.smax.nxv2i1( %x) + ret i1 %r +} + +define i1 @reduction_logical_umin(<2 x i1> %x) { +; CHECK-LABEL: @reduction_logical_umin( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2 +; CHECK-NEXT: [[R:%.*]] = icmp eq i2 [[TMP1]], -1 +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.umin.v4i1(<2 x i1> %x) + ret i1 %r +} + +define i1 @reduction_logical_umin_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_umin_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.umin.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.umin.nxv2i1( %x) + ret i1 %r +} + +define i1 @reduction_logical_umax(<2 x i1> %x) { +; CHECK-LABEL: @reduction_logical_umax( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2 +; CHECK-NEXT: [[R:%.*]] = icmp ne i2 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.umax.v4i1(<2 x i1> %x) + ret i1 %r +} + +define i1 @reduction_logical_umax_nxv2i1( %x) { +; CHECK-LABEL: @reduction_logical_umax_nxv2i1( +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.umax.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: ret i1 [[R]] +; + %r = call i1 @llvm.vector.reduce.umax.nxv2i1( %x) + ret i1 %r +} + + define i1 @reduction_logical_or_reverse_nxv2i1( %p) { ; CHECK-LABEL: @reduction_logical_or_reverse_nxv2i1( ; CHECK-NEXT: [[RED:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[P:%.*]]) @@ -93,5 +227,15 @@ declare i1 @llvm.vector.reduce.and.nxv2i1() declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>) declare i1 @llvm.vector.reduce.xor.nxv2i1() declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.mul.nxv2i1() +declare i1 @llvm.vector.reduce.mul.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.smin.nxv2i1() +declare i1 @llvm.vector.reduce.smin.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.smax.nxv2i1() +declare i1 @llvm.vector.reduce.smax.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.umin.nxv2i1() +declare i1 @llvm.vector.reduce.umin.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.umax.nxv2i1() +declare i1 @llvm.vector.reduce.umax.v2i1(<2 x i1>) declare @llvm.vector.reverse.nxv2i1() declare <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1>) From ead486ca61ab06d46aa4b30c91d1f40e5e5e43e5 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Wed, 17 Jul 2024 23:42:10 +0530 Subject: [PATCH 320/777] [ClangLinkerWrapper] Fix intermediate file naming for multi-arch compilation (#99325) When save-temps is enabled and the given offload-archs differ only in target features with the same arch, the intermediate postlink.bc and postopt.bc files were getting overwritten. This fix, suffixes the intermediate file names with the complete TargetID. E.g. 
`helloworld.amdgcn-amd-amdhsa.gfx90a:xnack+.postlink.bc` and `helloworld.amdgcn-amd-amdhsa.gfx90a:xnack+.postopt.bc` --- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index cb4cc5debae87..5edf4c982baa4 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -14,6 +14,7 @@ // //===---------------------------------------------------------------------===// +#include "clang/Basic/TargetID.h" #include "clang/Basic/Version.h" #include "llvm/ADT/MapVector.h" #include "llvm/BinaryFormat/Magic.h" @@ -668,7 +669,8 @@ std::unique_ptr createLTO( ModuleHook Hook = [](size_t, const Module &) { return true; }) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); // We need to remove AMD's target-id from the processor if present. - StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first; + StringRef TargetID = Args.getLastArgValue(OPT_arch_EQ); + StringRef Arch = clang::getProcessorFromTargetID(Triple, TargetID); lto::Config Conf; lto::ThinBackend Backend; // TODO: Handle index-only thin-LTO @@ -712,7 +714,7 @@ std::unique_ptr createLTO( if (SaveTemps) { std::string TempName = (sys::path::filename(ExecutableName) + "." + - Triple.getTriple() + "." + Arch) + Triple.getTriple() + "." + TargetID) .str(); Conf.PostInternalizeModuleHook = [=](size_t Task, const Module &M) { std::string File = From 130ef7375493b560df08546666338233bacf95e5 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 17 Jul 2024 11:46:17 -0700 Subject: [PATCH 321/777] [CMake][Fuchsia] Install libc++ for baremetal targets (#99372) We already build the library and want to install it also. 
--- clang/cmake/caches/Fuchsia-stage2.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 01c47e5b0be44..73ebd36c28496 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -348,7 +348,6 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi) set(RUNTIMES_${target}_LIBCXX_ENABLE_RTTI OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "") - set(RUNTIMES_${target}_LIBCXX_INSTALL_LIBRARY OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_USE_COMPILER_RT ON CACHE BOOL "") set(RUNTIMES_${target}_LLVM_INCLUDE_TESTS OFF CACHE BOOL "") set(RUNTIMES_${target}_LLVM_ENABLE_ASSERTIONS OFF CACHE BOOL "") @@ -399,7 +398,6 @@ foreach(target riscv32-unknown-elf) set(RUNTIMES_${target}_LIBCXX_ENABLE_RTTI OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "") - set(RUNTIMES_${target}_LIBCXX_INSTALL_LIBRARY OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_USE_COMPILER_RT ON CACHE BOOL "") set(RUNTIMES_${target}_LLVM_INCLUDE_TESTS OFF CACHE BOOL "") set(RUNTIMES_${target}_LLVM_ENABLE_ASSERTIONS OFF CACHE BOOL "") From 194f98c2210bf40d0490613fddbf83e04c18ad9b Mon Sep 17 00:00:00 2001 From: Xing Xue Date: Wed, 17 Jul 2024 14:47:13 -0400 Subject: [PATCH 322/777] [libc++] basic_ios cannot store fill character WCHAR_MAX (#89305) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `libcxx std::basic_ios` uses `WEOF` to indicate the `fill` value is uninitialized. On some platforms (e.g AIX and zOS in 64-bit mode) `wchar_t` is 4 bytes `unsigned` and `wint_t` is also 4 bytes which means `WEOF` cannot be distinguished from `WCHAR_MAX` by `std::char_traits::eq_int_type()`, meaning this valid character value cannot be stored on affected platforms (as the implementation triggers reinitialization to `widen(’ ’)`). This patch introduces a new helper class `_FillHelper` uses a boolean variable to indicate whether the fill character has been initialized, which is used by default in libcxx ABI version 2. The patch does not affect ABI version 1 except for targets AIX in 32- and 64-bit and z/OS in 64-bit (so that the layout of the implementation is compatible with the current IBM system provided libc++) This is a continuation of Phabricator patch [D124555](https://reviews.llvm.org/D124555). This patch uses a modified version of the [approach](https://reviews.llvm.org/D124555#3566746) suggested by @ldionne . 
--------- Co-authored-by: Louis Dionne Co-authored-by: David Tenty --- libcxx/cmake/caches/AIX.cmake | 1 + libcxx/cmake/caches/s390x-ibm-zos-ascii.cmake | 1 + libcxx/cmake/caches/s390x-ibm-zos.cmake | 1 + libcxx/include/__configuration/abi.h | 7 +++ libcxx/include/ios | 50 ++++++++++++++++--- .../std.manip/setfill_wchar_max.pass.cpp | 38 ++++++++++++++ 6 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp diff --git a/libcxx/cmake/caches/AIX.cmake b/libcxx/cmake/caches/AIX.cmake index c01aa5b14df06..4ec78f9bbd592 100644 --- a/libcxx/cmake/caches/AIX.cmake +++ b/libcxx/cmake/caches/AIX.cmake @@ -15,3 +15,4 @@ set(LIBCXXABI_ENABLE_STATIC OFF CACHE BOOL "") set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") set(LIBUNWIND_ENABLE_SHARED ON CACHE BOOL "") set(LIBUNWIND_ENABLE_STATIC OFF CACHE BOOL "") +set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE" CACHE STRING "") diff --git a/libcxx/cmake/caches/s390x-ibm-zos-ascii.cmake b/libcxx/cmake/caches/s390x-ibm-zos-ascii.cmake index 95b7cbe776e05..68b1cc8eff9b0 100644 --- a/libcxx/cmake/caches/s390x-ibm-zos-ascii.cmake +++ b/libcxx/cmake/caches/s390x-ibm-zos-ascii.cmake @@ -20,3 +20,4 @@ set(LIBCXX_CXX_ABI system-libcxxabi CACHE STRING "") set(LIBCXX_ADDITIONAL_COMPILE_FLAGS "-fzos-le-char-mode=ascii" CACHE STRING "") set(LIBCXX_ADDITIONAL_LIBRARIES "-L../s390x-ibm-zos/lib -Wl,../s390x-ibm-zos/lib/libunwind.x" CACHE STRING "") +set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE" CACHE STRING "") diff --git a/libcxx/cmake/caches/s390x-ibm-zos.cmake b/libcxx/cmake/caches/s390x-ibm-zos.cmake index 3eaed3e67fc16..e51d5ff31ebfe 100644 --- a/libcxx/cmake/caches/s390x-ibm-zos.cmake +++ b/libcxx/cmake/caches/s390x-ibm-zos.cmake @@ -15,3 +15,4 @@ set(LIBCXX_DLL_NAME CRTEQCXE CACHE STRING "") set(LIBCXXABI_DLL_NAME CRTEQCXA CACHE STRING "") set(LIBCXXABI_ADDITIONAL_LIBRARIES "-Wl,lib/libunwind.x" CACHE STRING "") +set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE" CACHE STRING "") diff --git a/libcxx/include/__configuration/abi.h b/libcxx/include/__configuration/abi.h index 513da6e3b81b6..cbde7887becf1 100644 --- a/libcxx/include/__configuration/abi.h +++ b/libcxx/include/__configuration/abi.h @@ -91,6 +91,13 @@ # define _LIBCPP_ABI_USE_WRAP_ITER_IN_STD_STRING_VIEW // Dont' add an inline namespace for `std::filesystem` # define _LIBCPP_ABI_NO_FILESYSTEM_INLINE_NAMESPACE +// std::basic_ios uses WEOF to indicate that the fill value is +// uninitialized. However, on platforms where the size of char_type is +// equal to or greater than the size of int_type and char_type is unsigned, +// std::char_traits::eq_int_type() cannot distinguish between WEOF +// and WCHAR_MAX. This ABI setting determines whether we should instead track whether the fill +// value has been initialized using a separate boolean, which changes the ABI. 
+# define _LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE #elif _LIBCPP_ABI_VERSION == 1 # if !(defined(_LIBCPP_OBJECT_FORMAT_COFF) || defined(_LIBCPP_OBJECT_FORMAT_XCOFF)) // Enable compiling copies of now inline methods into the dylib to support diff --git a/libcxx/include/ios b/libcxx/include/ios index 0a813c07721fe..d8a3643c7ad50 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -519,6 +519,38 @@ inline _LIBCPP_HIDE_FROM_ABI void ios_base::exceptions(iostate __iostate) { clear(__rdstate_); } +template +// Attribute 'packed' is used to keep the layout compatible with the previous +// definition of the '__fill_' and '_set_' pair in basic_ios on AIX & z/OS. +struct _LIBCPP_PACKED _FillHelper { + _LIBCPP_HIDE_FROM_ABI void __init() { __set_ = false; } + _LIBCPP_HIDE_FROM_ABI _FillHelper& operator=(typename _Traits::int_type __x) { + __set_ = true; + __fill_val_ = __x; + return *this; + } + _LIBCPP_HIDE_FROM_ABI bool __is_set() const { return __set_; } + _LIBCPP_HIDE_FROM_ABI typename _Traits::int_type __get() const { return __fill_val_; } + +private: + typename _Traits::int_type __fill_val_; + bool __set_; +}; + +template +struct _LIBCPP_PACKED _SentinelValueFill { + _LIBCPP_HIDE_FROM_ABI void __init() { __fill_val_ = _Traits::eof(); } + _LIBCPP_HIDE_FROM_ABI _SentinelValueFill& operator=(typename _Traits::int_type __x) { + __fill_val_ = __x; + return *this; + } + _LIBCPP_HIDE_FROM_ABI bool __is_set() const { return __fill_val_ != _Traits::eof(); } + _LIBCPP_HIDE_FROM_ABI typename _Traits::int_type __get() const { return __fill_val_; } + +private: + typename _Traits::int_type __fill_val_; +}; + template class _LIBCPP_TEMPLATE_VIS basic_ios : public ios_base { public: @@ -588,7 +620,13 @@ protected: private: basic_ostream* __tie_; - mutable int_type __fill_; + +#if defined(_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE) + using _FillType = _FillHelper; +#else + using _FillType = _SentinelValueFill; +#endif + mutable _FillType __fill_; }; template @@ -603,7 +641,7 @@ template inline _LIBCPP_HIDE_FROM_ABI void basic_ios<_CharT, _Traits>::init(basic_streambuf* __sb) { ios_base::init(__sb); __tie_ = nullptr; - __fill_ = traits_type::eof(); + __fill_.__init(); } template @@ -653,16 +691,16 @@ inline _LIBCPP_HIDE_FROM_ABI _CharT basic_ios<_CharT, _Traits>::widen(char __c) template inline _LIBCPP_HIDE_FROM_ABI _CharT basic_ios<_CharT, _Traits>::fill() const { - if (traits_type::eq_int_type(traits_type::eof(), __fill_)) + if (!__fill_.__is_set()) __fill_ = widen(' '); - return __fill_; + return __fill_.__get(); } template inline _LIBCPP_HIDE_FROM_ABI _CharT basic_ios<_CharT, _Traits>::fill(char_type __ch) { - if (traits_type::eq_int_type(traits_type::eof(), __fill_)) + if (!__fill_.__is_set()) __fill_ = widen(' '); - char_type __r = __fill_; + char_type __r = __fill_.__get(); __fill_ = __ch; return __r; } diff --git a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp new file mode 100644 index 0000000000000..f22850877dd62 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Test that WCHAR_MAX as a wchar_t value can be set as the fill character. + +// UNSUPPORTED: no-wide-characters + +// Expect the test case to fail on targets where WEOF is the same as +// WCHAR_MAX with the libcpp ABI version 1 implementation. The libcpp ABI +// version 2 implementation fixes the problem. + +// XFAIL: target={{.*}}-windows{{.*}} && libcpp-abi-version=1 +// XFAIL: target=armv{{7|8}}l-linux-gnueabihf && libcpp-abi-version=1 +// XFAIL: target=aarch64-linux-gnu && libcpp-abi-version=1 + +#include +#include +#include +#include + +template +struct testbuf : public std::basic_streambuf { + testbuf() {} +}; + +int main(int, char**) { + testbuf sb; + std::wostream os(&sb); + os << std::setfill((wchar_t)WCHAR_MAX); + assert(os.fill() == (wchar_t)WCHAR_MAX); + + return 0; +} From da5264efa3ae50d61a4fc584f8c4f60a51539a96 Mon Sep 17 00:00:00 2001 From: Angel Zhang Date: Wed, 17 Jul 2024 14:47:53 -0400 Subject: [PATCH 323/777] [bazel][docs] Update build documentation (#99339) This PR updates the Quick Start section to provide more detailed build instructions. --- utils/bazel/README.md | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/utils/bazel/README.md b/utils/bazel/README.md index 16d736852d130..d3e7f15f17683 100644 --- a/utils/bazel/README.md +++ b/utils/bazel/README.md @@ -33,15 +33,31 @@ for adding this configuration. [bazelisk](https://github.com/bazelbuild/bazelisk) which automates downloading the proper bazel version 3. `cd utils/bazel` -4. `bazel build --config=generic_clang @llvm-project//...` - * If you're using clang, it's expected that lld is also available - * If you're using MSVC or gcc, instead of `--config=generic_clang`, pass - `--config=generic_gcc` or `--config=msvc` - * To specify a specific local compiler to use, add the following bazel - flag: `--repo_env=CC=/usr/bin/clang` - * `--config=generic_clang`/`--config=generic_gcc` by default set - `--repo_env=CC=clang`/`--repo_env=CC=gcc`, using `clang`/`gcc` on the - `PATH` +4. The `bazel build` command depends on the local compiler you want to use. + * For **clang**, go to step 5. + * For **gcc** or **MSVC**, go to step 6 +5. If you are using **clang**, it is expected that lld is also available. + The `--config=generic_clang` flag by default sets the compiler to be `clang` + binary on the `PATH`. + ``` + bazel build --config=generic_clang @llvm-project//... + ``` + To provide a specific path to your `clang`, use the `--repo_env` Bazel flag. + For example: + ``` + bazel build --config=generic_clang --repo_env=CC=/usr/bin/clang --repo_env=CXX=/usr/bin/clang++ @llvm-project//... + ``` +6. If you are using **gcc** or **MSVC**, instead of `--config=generic_clang` + , pass `--config=generic_gcc` or `--config=generic_msvc`, which sets the + compiler to be `gcc` binary on the `PATH`. + ``` + bazel build --config=generic_gcc @llvm-project//... + ``` + To provide a specific path to your `gcc`, use the `--repo_env` Bazel flag. + For example: + ``` + bazel build --config=generic_gcc --repo_env=CC=/usr/bin/gcc --repo_env=CXX=/usr/bin/g++ @llvm-project//... 
+ ``` # Configuration From 21e6777957457451196084cd48ebc42bce9619f0 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 17 Jul 2024 13:50:35 -0500 Subject: [PATCH 324/777] [mlir][NFC] Add rewrite header to fix standalone header compile (#99370) This uses `MlirRewriterBase` from from `mlir-c/Rewrite.h` without including it. --- mlir/include/mlir/CAPI/Rewrite.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/include/mlir/CAPI/Rewrite.h b/mlir/include/mlir/CAPI/Rewrite.h index f0bb9337e49ea..1038c0a575cf2 100644 --- a/mlir/include/mlir/CAPI/Rewrite.h +++ b/mlir/include/mlir/CAPI/Rewrite.h @@ -15,6 +15,7 @@ #ifndef MLIR_CAPI_REWRITE_H #define MLIR_CAPI_REWRITE_H +#include "mlir-c/Rewrite.h" #include "mlir/CAPI/Wrap.h" #include "mlir/IR/PatternMatch.h" From ff0821583eab1651ff126bbf4f881e6163b67435 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Wed, 17 Jul 2024 12:14:36 -0700 Subject: [PATCH 325/777] [msan] Precommit MSan Arm NEON vst tests (#98247) These tests show that MSan currently does not handle vst (or vld) correctly. --- .../MemorySanitizer/AArch64/neon_vst.ll | 1515 +++++++++++++++++ 1 file changed, 1515 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll new file mode 100644 index 0000000000000..1c4ca47b60c13 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll @@ -0,0 +1,1515 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; Test memory sanitizer instrumentation for Arm NEON VST instructions. +; +; RUN: opt < %s -passes=msan -S | FileCheck %s +; +; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %P) + ret void +} + +define void 
@st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_8b_undefA +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> %B, ptr %P) + ret void +} + +define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_8b_undefB +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> undef, ptr %P) + ret void +} + +define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_8b_undefAB +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr %P) + ret void +} + +define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: 
[[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) + ret void +} + +define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefA +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> %C, ptr %P) + ret void +} + +define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefB +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, ptr %P) + ret void +} + +define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> 
[[B]], <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> undef, ptr %P) + ret void +} + +define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefAB +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, ptr %P) + ret void +} + +define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefAC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, ptr %P) + ret void +} + +define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefBC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, ptr %P) + ret void +} + +define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8b_undefABC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P) + ret void +} + +define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x 
i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefA +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefB +; CHECK-SAME: (<8 x i8> 
[[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> undef, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefD +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> undef, ptr %P) + ret void +} + +define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefAB +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefAC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefBC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: 
ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefBD +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, <8 x i8> undef, ptr %P) + ret void +} + +define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefABC +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> %D, ptr %P) + ret void +} + +define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefABD +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, <8 x i8> undef, ptr %P) + ret void +} + +define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefACD +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint 
(ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, <8 x i8> undef, ptr %P) + ret void +} + +define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefBCD +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P) + ret void +} + +define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8b_undefABCD +; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P) + ret void +} + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +declare void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8>, <8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly + +define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; 
CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %P) + ret void +} + +define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) + ret void +} + +define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] 
= icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) + ret void +} + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +declare void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly + +define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_4h +; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %P) + ret void +} + +define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_4h +; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) + ret void +} + +define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_4h +; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %P) + ret void +} + +; 
----------------------------------------------------------------------------------------------------------------------------------------------- + +declare void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16>, <4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly + +define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_8h +; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %P) + ret void +} + +define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_8h +; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: 
call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) + ret void +} + +define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_8h +; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %P) + ret void +} + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +declare void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly + +define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_2s +; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: 
call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %P) + ret void +} + +define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2s +; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) + ret void +} + +define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2s +; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %P) + ret void +} + +declare void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32>, <2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly + +define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_4s +; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %P) + ret void +} + +define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_4s +; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 
x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) + ret void +} + +define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_4s +; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> 
[[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %P) + ret void +} + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, ptr) nounwind sanitize_memory readonly + +; If there's only one element, st2/3/4 don't make much sense, stick to st1. +define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_1d +; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %P) + ret void +} + +define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_1d +; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 
[[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) + ret void +} + +define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_1d +; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %P) + ret void +} + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +declare void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly + +define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_2d +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %P) + ret void +} + +define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_2d_undefA +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> %B, ptr %P) + ret void +} + +define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_2d_undefB +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> undef, ptr %P) + ret void +} + +define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st2_2d_undefAB +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} + +define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d +; CHECK-SAME: (<2 
x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) + ret void +} + +define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefA +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, ptr %P) + ret void +} + +define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefB +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], ptr [[P]]) +; 
CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, ptr %P) + ret void +} + +define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, ptr %P) + ret void +} + +define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefAB +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, ptr %P) + ret void +} + +define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefAC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, ptr %P) + ret void +} + +define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefBC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} + +define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st3_2d_undefABC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) + ret void +} + +declare void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64>, <2 x i64>, 
ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, ptr) nounwind sanitize_memory readonly + +define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefA +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefB +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = 
bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefAB +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void 
@st4_2d_undefAC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefAD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefBC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefBD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefCD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefABC +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> %D, ptr %P) + ret void +} + +define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefABD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefACD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefBCD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} + +define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory { +; CHECK-LABEL: define void @st4_2d_undefABCD +; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P) + ret void +} From 51122fb4469b56b207bcae0c39182f961e4276fd Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 17 Jul 2024 23:17:12 +0400 Subject: [PATCH 326/777] [BOLT][NFC] Fix build (#99361) On clang 14 the build is failing with: reference to local binding 'ParentName' declared in enclosing function 'llvm::bolt::RewriteInstance::registerFragments' --- bolt/lib/Rewrite/RewriteInstance.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp 
b/bolt/lib/Rewrite/RewriteInstance.cpp index 32562ccb6b345..4ae802dc97ccd 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1432,7 +1432,9 @@ void RewriteInstance::registerFragments() { // of the last local symbol. ELFSymbolRef LocalSymEnd = ELF64LEFile->toSymbolRef(SymTab, SymTab->sh_info); - for (auto &[ParentName, BF] : AmbiguousFragments) { + for (auto &Fragment : AmbiguousFragments) { + const StringRef &ParentName = Fragment.first; + BinaryFunction *BF = Fragment.second; const uint64_t Address = BF->getAddress(); // Get fragment's own symbol From a51f343b433120e45f186e5507e8a522d4d7192f Mon Sep 17 00:00:00 2001 From: AtariDreams Date: Wed, 17 Jul 2024 15:19:02 -0400 Subject: [PATCH 327/777] [CodeGen] Emit more efficient magic numbers for exact udivs (#87161) Have simpler lowering for exact udivs in both SelectionDAG and GlobalISel. The algorithm is the same between unsigned exact divs and signed divs save for arithmetic vs logical shift for even divisors, according to Hacker's Delight, 2nd Edition, page 242. --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 62 ++++++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 71 +++++++- .../AArch64/GlobalISel/combine-udiv.ll | 42 +++++ .../AArch64/GlobalISel/combine-udiv.mir | 122 +++++++++++++ llvm/test/CodeGen/X86/udiv-exact.ll | 171 ++++++++++++++++++ 5 files changed, 462 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/X86/udiv-exact.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index c27b882f17003..dfc3d73e322b8 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5183,8 +5183,35 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) { KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0; auto &MIB = Builder; + bool UseSRL = false; bool UseNPQ = false; SmallVector PreShifts, PostShifts, MagicFactors, NPQFactors; + SmallVector Shifts, Factors; + auto *RHSDefInstr = cast(getDefIgnoringCopies(RHS, MRI)); + bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value(); + + auto BuildExactUDIVPattern = [&](const Constant *C) { + // Don't recompute inverses for each splat element. + if (IsSplat && !Factors.empty()) { + Shifts.push_back(Shifts[0]); + Factors.push_back(Factors[0]); + return true; + } + + auto *CI = cast(C); + APInt Divisor = CI->getValue(); + unsigned Shift = Divisor.countr_zero(); + if (Shift) { + Divisor.lshrInPlace(Shift); + UseSRL = true; + } + + // Calculate the multiplicative inverse modulo BW. + APInt Factor = Divisor.multiplicativeInverse(); + Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0)); + Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0)); + return true; + }; auto BuildUDIVPattern = [&](const Constant *C) { auto *CI = cast(C); @@ -5231,6 +5258,29 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) { return true; }; + if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + // Collect all magic values from the build vector. 
+ bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern); + (void)Matched; + assert(Matched && "Expected unary predicate match to succeed"); + + Register Shift, Factor; + if (Ty.isVector()) { + Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0); + Factor = MIB.buildBuildVector(Ty, Factors).getReg(0); + } else { + Shift = Shifts[0]; + Factor = Factors[0]; + } + + Register Res = LHS; + + if (UseSRL) + Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0); + + return MIB.buildMul(Ty, Res, Factor); + } + // Collect the shifts/magic values from each element. bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern); (void)Matched; @@ -5283,9 +5333,6 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register RHS = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(Dst); - auto *RHSDef = MRI.getVRegDef(RHS); - if (!isConstantOrConstantVector(*RHSDef, MRI)) - return false; auto &MF = *MI.getMF(); AttributeList Attr = MF.getFunction().getAttributes(); @@ -5300,6 +5347,15 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) { if (MF.getFunction().hasMinSize()) return false; + if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + return matchUnaryPredicate( + MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); + } + + auto *RHSDef = MRI.getVRegDef(RHS); + if (!isConstantOrConstantVector(*RHSDef, MRI)) + return false; + // Don't do this if the types are not going to be legal. if (LI) { if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}})) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index adf14bd007356..c3a20b5044c5f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6092,6 +6092,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, /// Given an exact SDIV by a constant, create a multiplication /// with the multiplicative inverse of the constant. +/// Ref: "Hacker's Delight" by Henry Warren, 2nd Edition, p. 242 static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &Created) { @@ -6141,10 +6142,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N, } SDValue Res = Op0; - - // Shift the value upfront if it is even, so the LSB is one. if (UseSRA) { - // TODO: For UDIV use SRL instead of SRA. SDNodeFlags Flags; Flags.setExact(true); Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags); @@ -6154,6 +6152,69 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N, return DAG.getNode(ISD::MUL, dl, VT, Res, Factor); } +/// Given an exact UDIV by a constant, create a multiplication +/// with the multiplicative inverse of the constant. +/// Ref: "Hacker's Delight" by Henry Warren, 2nd Edition, p. 242 +static SDValue BuildExactUDIV(const TargetLowering &TLI, SDNode *N, + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl &Created) { + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); + + bool UseSRL = false; + SmallVector Shifts, Factors; + + auto BuildUDIVPattern = [&](ConstantSDNode *C) { + if (C->isZero()) + return false; + APInt Divisor = C->getAPIntValue(); + unsigned Shift = Divisor.countr_zero(); + if (Shift) { + Divisor.lshrInPlace(Shift); + UseSRL = true; + } + // Calculate the multiplicative inverse modulo BW. 
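+ // For example, an exact udiv of an i32 by 24 lowers to
+ // (x lshr exact 3) * 0xAAAAAAAB: 24 == 8 * 3, and 0xAAAAAAAB is the
+ // multiplicative inverse of 3 modulo 2^32 (3 * 0xAAAAAAAB == 2^33 + 1 == 1 mod 2^32),
+ // as the X86 test2 below demonstrates.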
+ APInt Factor = Divisor.multiplicativeInverse(); + Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT)); + Factors.push_back(DAG.getConstant(Factor, dl, SVT)); + return true; + }; + + SDValue Op1 = N->getOperand(1); + + // Collect all magic values from the build vector. + if (!ISD::matchUnaryPredicate(Op1, BuildUDIVPattern)) + return SDValue(); + + SDValue Shift, Factor; + if (Op1.getOpcode() == ISD::BUILD_VECTOR) { + Shift = DAG.getBuildVector(ShVT, dl, Shifts); + Factor = DAG.getBuildVector(VT, dl, Factors); + } else if (Op1.getOpcode() == ISD::SPLAT_VECTOR) { + assert(Shifts.size() == 1 && Factors.size() == 1 && + "Expected matchUnaryPredicate to return one element for scalable " + "vectors"); + Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]); + Factor = DAG.getSplatVector(VT, dl, Factors[0]); + } else { + assert(isa(Op1) && "Expected a constant"); + Shift = Shifts[0]; + Factor = Factors[0]; + } + + SDValue Res = N->getOperand(0); + if (UseSRL) { + SDNodeFlags Flags; + Flags.setExact(true); + Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, Flags); + Created.push_back(Res.getNode()); + } + + return DAG.getNode(ISD::MUL, dl, VT, Res, Factor); +} + SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { @@ -6413,6 +6474,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, return SDValue(); } + // If the udiv has an 'exact' bit we can use a simpler lowering. + if (N->getFlags().hasExact()) + return BuildExactUDIV(*this, N, dl, DAG, Created); + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll index c97a00ccdd455..d465e0237201b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll @@ -269,3 +269,45 @@ define i32 @udiv_div_by_180(i32 %x) %udiv = udiv i32 %truncate, 180 ret i32 %udiv } + +define i32 @udiv_div_by_180_exact(i32 %x) +; SDAG-LABEL: udiv_div_by_180_exact: +; SDAG: // %bb.0: +; SDAG-NEXT: lsr w8, w0, #2 +; SDAG-NEXT: mov w9, #20389 // =0x4fa5 +; SDAG-NEXT: movk w9, #42234, lsl #16 +; SDAG-NEXT: mul w0, w8, w9 +; SDAG-NEXT: ret +; +; GISEL-LABEL: udiv_div_by_180_exact: +; GISEL: // %bb.0: +; GISEL-NEXT: lsr w8, w0, #2 +; GISEL-NEXT: mov w9, #20389 // =0x4fa5 +; GISEL-NEXT: movk w9, #42234, lsl #16 +; GISEL-NEXT: mul w0, w8, w9 +; GISEL-NEXT: ret +{ + %udiv = udiv exact i32 %x, 180 + ret i32 %udiv +} + +define <4 x i32> @udiv_div_by_104_exact(<4 x i32> %x) +; SDAG-LABEL: udiv_div_by_104_exact: +; SDAG: // %bb.0: +; SDAG-NEXT: adrp x8, .LCPI8_0 +; SDAG-NEXT: ushr v0.4s, v0.4s, #3 +; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; SDAG-NEXT: mul v0.4s, v0.4s, v1.4s +; SDAG-NEXT: ret +; +; GISEL-LABEL: udiv_div_by_104_exact: +; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, .LCPI8_0 +; GISEL-NEXT: ushr v0.4s, v0.4s, #3 +; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; GISEL-NEXT: mul v0.4s, v0.4s, v1.4s +; GISEL-NEXT: ret +{ + %udiv = udiv exact <4 x i32> %x, + ret <4 x i32> %udiv +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir index 02233b9f498bd..f8578a694e2d4 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir @@ -304,5 +304,127 @@ body: | %10:_(<8 x s16>) = G_UDIV %0, %1 $q0 = COPY %10(<8 x s16>) RET_ReallyLR implicit $q0 +... 
+--- +name: udiv_exact +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: udiv_exact + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = exact G_LSHR [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[LSHR]], [[C1]] + ; CHECK-NEXT: $w0 = COPY [[MUL]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 104 + %2:_(s32) = exact G_UDIV %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: udiv_noexact +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: udiv_noexact + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32) + ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 104 + %2:_(s32) = G_UDIV %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: udiv_exact_minsize +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: udiv_exact_minsize + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = exact G_LSHR [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[LSHR]], [[C1]] + ; CHECK-NEXT: $w0 = COPY [[MUL]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 104 + %2:_(s32) = exact G_UDIV %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: div_v4s32 +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: div_v4s32 + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C1]](s32), [[C2]](s32) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = exact G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[LSHR]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %c1:_(s32) = G_CONSTANT i32 104 + %c2:_(s32) = G_CONSTANT i32 72 + %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c2(s32), %c1(s32), %c2(s32) + %3:_(<4 x s32>) = exact G_UDIV %0, %1 + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: div_v4s32_splat +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: div_v4s32_splat + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = exact G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[LSHR]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %c1:_(s32) = G_CONSTANT i32 104 + %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c1(s32), %c1(s32), %c1(s32) + %3:_(<4 x s32>) = exact G_UDIV %0, %1 + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit $q0 ... diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll new file mode 100644 index 0000000000000..271d11edff9a7 --- /dev/null +++ b/llvm/test/CodeGen/X86/udiv-exact.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 + +define i32 @test1(i32 %x) { +; X86-LABEL: test1: +; X86: # %bb.0: +; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %eax # imm = 0xC28F5C29 +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # %bb.0: +; X64-NEXT: imull $-1030792151, %edi, %eax # imm = 0xC28F5C29 +; X64-NEXT: retq + %div = udiv exact i32 %x, 25 + ret i32 %div +} + +define i32 @test2(i32 %x) { +; X86-LABEL: test2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # %bb.0: +; X64-NEXT: shrl $3, %edi +; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB +; X64-NEXT: retq + %div = udiv exact i32 %x, 24 + ret i32 %div +} + +define <4 x i32> @test3(<4 x i32> %x) { +; X86-LABEL: test3: +; X86: # %bb.0: +; X86-NEXT: psrld $3, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # %bb.0: +; X64-NEXT: vpsrld $3, %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %div = udiv exact <4 x i32> %x, + ret <4 x i32> %div +} + +define <4 x i32> @test4(<4 x i32> %x) { +; X86-LABEL: test4: +; X86: # %bb.0: +; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; +; 
X64-LABEL: test4: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %div = udiv exact <4 x i32> %x, + ret <4 x i32> %div +} + +define <4 x i32> @test5(<4 x i32> %x) { +; X86-LABEL: test5: +; X86: # %bb.0: +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrld $3, %xmm1 +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; +; X64-LABEL: test5: +; X64: # %bb.0: +; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: retq + %div = udiv exact <4 x i32> %x, + ret <4 x i32> %div +} + +define <4 x i32> @test6(<4 x i32> %x) { +; X86-LABEL: test6: +; X86: # %bb.0: +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrld $3, %xmm1 +; X86-NEXT: psrld $1, %xmm0 +; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] +; X86-NEXT: pmuludq %xmm0, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test6: +; X64: # %bb.0: +; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: retq + %div = udiv exact <4 x i32> %x, + ret <4 x i32> %div +} + +define <4 x i32> @test7(<4 x i32> %x) { +; X86-LABEL: test7: +; X86: # %bb.0: +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; +; X64-LABEL: test7: +; X64: # %bb.0: +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: retq + %div = udiv exact <4 x i32> %x, + ret <4 x i32> %div +} + +define <4 x i32> @test8(<4 x i32> %x) { +; X86-LABEL: test8: +; X86: # %bb.0: +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrld $3, %xmm1 +; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; +; X64-LABEL: test8: +; X64: # %bb.0: +; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: retq + %div = udiv exact <4 x i32> %x, + ret <4 x i32> %div +} From d85f1054fbb04c5299848bf81aa350442f9a56c7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 17 
Jul 2024 12:23:05 -0700 Subject: [PATCH 328/777] [RISCV] Teach fillUpExtensionSupportForSplat to handle nxvXi64 VMV_V_X_VL on RV32. (#99251) A nxvXi64 VMV_V_X_VL on RV32 sign extends its 32 bit input to 64 bits. If that input is positive, the sign extend can also be considered as a zero extend. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 17 +- llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll | 322 +++++---------- llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll | 382 ++++++------------ .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll | 184 +++------ .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll | 216 ++++------ .../RISCV/rvv/fixed-vectors-shuffle-rotate.ll | 180 +++++---- .../CodeGen/RISCV/rvv/fixed-vectors-vrol.ll | 124 ++++-- .../CodeGen/RISCV/rvv/fixed-vectors-vror.ll | 180 ++++++--- .../CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll | 72 +--- .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 69 +--- .../CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll | 19 +- .../CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll | 78 ++-- llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll | 52 ++- llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll | 52 ++- llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll | 21 +- llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 22 +- 16 files changed, 824 insertions(+), 1166 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fef1441eca9c6..21193ebe1eb94 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14503,10 +14503,21 @@ struct NodeExtensionHelper { // widening opcode by splatting to smaller element size. unsigned EltBits = VT.getScalarSizeInBits(); unsigned ScalarBits = Op.getValueSizeInBits(); - // Make sure we're getting all element bits from the scalar register. - // FIXME: Support implicit sign extension of vmv.v.x? - if (ScalarBits < EltBits) + // If we're not getting all bits from the element, we need special handling. + if (ScalarBits < EltBits) { + // This should only occur on RV32. + assert(Opc == RISCVISD::VMV_V_X_VL && EltBits == 64 && ScalarBits == 32 && + !Subtarget.is64Bit() && "Unexpected splat"); + // vmv.v.x sign extends narrow inputs. + SupportsSExt = true; + + // If the input is positive, then sign extend is also zero extend. 
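+ // For example, splatting the non-negative 32-bit value 3 with vmv.v.x at
+ // SEW=64 produces the element 0x0000000000000003, which is both sext(3)
+ // and zext(3), so signed and unsigned widening ops can still be formed.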
+ if (DAG.SignBitIsZero(Op)) + SupportsZExt = true; + + EnforceOneUse = false; return; + } unsigned NarrowSize = EltBits / 2; // If the narrow type cannot be expressed with a legal VMV, diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index 6e538f3dfb38e..d51f5eacd7d91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1229,36 +1229,20 @@ define @ctlz_nxv1i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_nxv1i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32F-NEXT: vmv.v.x v9, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: vsrl.vi v8, v10, 23 -; RV32F-NEXT: vwsubu.wv v9, v9, v8 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vminu.vx v8, v9, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_nxv1i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64F-NEXT: vmv.v.x v9, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v10, v8 -; RV64F-NEXT: vsrl.vi v8, v10, 23 -; RV64F-NEXT: vwsubu.vv v10, v9, v8 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64F-NEXT: vminu.vx v8, v10, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmv.v.x v9, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vwsubu.vv v10, v9, v8 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v10, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv1i64: ; CHECK-D: # %bb.0: @@ -1385,36 +1369,20 @@ define @ctlz_nxv2i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_nxv2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32F-NEXT: vmv.v.x v10, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v8 -; RV32F-NEXT: vsrl.vi v8, v12, 23 -; RV32F-NEXT: vwsubu.wv v10, v10, v8 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vminu.vx v8, v10, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_nxv2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64F-NEXT: vmv.v.x v10, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v11, v8 -; RV64F-NEXT: vsrl.vi v8, v11, 23 -; RV64F-NEXT: vwsubu.vv v12, v10, v8 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64F-NEXT: vminu.vx v8, v12, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmv.v.x v10, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v11, v8 +; CHECK-F-NEXT: vsrl.vi v8, v11, 23 +; CHECK-F-NEXT: vwsubu.vv v12, v10, v8 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v12, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: 
ctlz_nxv2i64: ; CHECK-D: # %bb.0: @@ -1541,36 +1509,20 @@ define @ctlz_nxv4i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_nxv4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32F-NEXT: vmv.v.x v12, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v16, v8 -; RV32F-NEXT: vsrl.vi v8, v16, 23 -; RV32F-NEXT: vwsubu.wv v12, v12, v8 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32F-NEXT: vminu.vx v8, v12, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_nxv4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64F-NEXT: vmv.v.x v12, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v14, v8 -; RV64F-NEXT: vsrl.vi v8, v14, 23 -; RV64F-NEXT: vwsubu.vv v16, v12, v8 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64F-NEXT: vminu.vx v8, v16, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_nxv4i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmv.v.x v12, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v14, v8 +; CHECK-F-NEXT: vsrl.vi v8, v14, 23 +; CHECK-F-NEXT: vwsubu.vv v16, v12, v8 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v16, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv4i64: ; CHECK-D: # %bb.0: @@ -1697,36 +1649,20 @@ define @ctlz_nxv8i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_nxv8i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32F-NEXT: vmv.v.x v16, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v24, v8 -; RV32F-NEXT: vsrl.vi v8, v24, 23 -; RV32F-NEXT: vwsubu.wv v16, v16, v8 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32F-NEXT: vminu.vx v8, v16, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_nxv8i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV64F-NEXT: vmv.v.x v16, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v20, v8 -; RV64F-NEXT: vsrl.vi v8, v20, 23 -; RV64F-NEXT: vwsubu.vv v24, v16, v8 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64F-NEXT: vminu.vx v8, v24, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmv.v.x v16, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v20, v8 +; CHECK-F-NEXT: vsrl.vi v8, v20, 23 +; CHECK-F-NEXT: vwsubu.vv v24, v16, v8 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v24, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv8i64: ; CHECK-D: # %bb.0: @@ -2895,31 +2831,17 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_zero_undef_nxv1i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32F-NEXT: vmv.v.x v9, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma 
-; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: vsrl.vi v8, v10, 23 -; RV32F-NEXT: vwsubu.wv v9, v9, v8 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: vmv1r.v v8, v9 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_zero_undef_nxv1i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64F-NEXT: vmv.v.x v9, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v10, v8 -; RV64F-NEXT: vsrl.vi v10, v10, 23 -; RV64F-NEXT: vwsubu.vv v8, v9, v10 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmv.v.x v9, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v10, v10, 23 +; CHECK-F-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64: ; CHECK-D: # %bb.0: @@ -3043,31 +2965,17 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_zero_undef_nxv2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32F-NEXT: vmv.v.x v10, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v8 -; RV32F-NEXT: vsrl.vi v8, v12, 23 -; RV32F-NEXT: vwsubu.wv v10, v10, v8 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: vmv2r.v v8, v10 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_zero_undef_nxv2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64F-NEXT: vmv.v.x v10, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v11, v8 -; RV64F-NEXT: vsrl.vi v11, v11, 23 -; RV64F-NEXT: vwsubu.vv v8, v10, v11 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmv.v.x v10, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v11, v8 +; CHECK-F-NEXT: vsrl.vi v11, v11, 23 +; CHECK-F-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64: ; CHECK-D: # %bb.0: @@ -3191,31 +3099,17 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_zero_undef_nxv4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32F-NEXT: vmv.v.x v12, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v16, v8 -; RV32F-NEXT: vsrl.vi v8, v16, 23 -; RV32F-NEXT: vwsubu.wv v12, v12, v8 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: vmv4r.v v8, v12 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_zero_undef_nxv4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64F-NEXT: vmv.v.x v12, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v14, v8 -; RV64F-NEXT: vsrl.vi v14, v14, 23 -; RV64F-NEXT: vwsubu.vv v8, v12, v14 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmv.v.x v12, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v14, v8 +; CHECK-F-NEXT: vsrl.vi v14, v14, 23 +; CHECK-F-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; 
CHECK-D-LABEL: ctlz_zero_undef_nxv4i64: ; CHECK-D: # %bb.0: @@ -3339,31 +3233,17 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_zero_undef_nxv8i64: -; RV32F: # %bb.0: -; RV32F-NEXT: li a0, 190 -; RV32F-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32F-NEXT: vmv.v.x v16, a0 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v24, v8 -; RV32F-NEXT: vsrl.vi v8, v24, 23 -; RV32F-NEXT: vwsubu.wv v16, v16, v8 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: vmv8r.v v8, v16 -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_zero_undef_nxv8i64: -; RV64F: # %bb.0: -; RV64F-NEXT: li a0, 190 -; RV64F-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV64F-NEXT: vmv.v.x v16, a0 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vfncvt.f.xu.w v20, v8 -; RV64F-NEXT: vsrl.vi v20, v20, 23 -; RV64F-NEXT: vwsubu.vv v8, v16, v20 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmv.v.x v16, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v20, v8 +; CHECK-F-NEXT: vsrl.vi v20, v20, 23 +; CHECK-F-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64: ; CHECK-D: # %bb.0: @@ -3387,4 +3267,6 @@ define @ctlz_zero_undef_nxv8i64( %va) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32: {{.*}} +; RV32F: {{.*}} ; RV64: {{.*}} +; RV64F: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll index 50bbe4f7b4c2d..3bddcf798f66b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll @@ -1213,42 +1213,23 @@ define @cttz_nxv1i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_nxv1i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32F-NEXT: vrsub.vi v9, v8, 0 -; RV32F-NEXT: vand.vv v9, v8, v9 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v9 -; RV32F-NEXT: vsrl.vi v9, v10, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vzext.vf2 v10, v9 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v9, v10, a1 -; RV32F-NEXT: vmseq.vi v0, v8, 0 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_nxv1i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64F-NEXT: vrsub.vi v9, v8, 0 -; RV64F-NEXT: vand.vv v9, v8, v9 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v10, v9 -; RV64F-NEXT: vsrl.vi v9, v10, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v10, v9, a1 -; RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64F-NEXT: vmseq.vi v0, v8, 0 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vmerge.vxm v8, v10, a1, v0 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v9 +; CHECK-F-NEXT: vsrl.vi v9, v10, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v10, v9, a1 +; 
CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv1i64: ; CHECK-D: # %bb.0: @@ -1358,42 +1339,23 @@ define @cttz_nxv2i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_nxv2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32F-NEXT: vrsub.vi v10, v8, 0 -; RV32F-NEXT: vand.vv v10, v8, v10 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v10 -; RV32F-NEXT: vsrl.vi v10, v12, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vzext.vf2 v12, v10 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v10, v12, a1 -; RV32F-NEXT: vmseq.vi v0, v8, 0 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_nxv2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64F-NEXT: vrsub.vi v10, v8, 0 -; RV64F-NEXT: vand.vv v10, v8, v10 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v12, v10 -; RV64F-NEXT: vsrl.vi v10, v12, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v12, v10, a1 -; RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64F-NEXT: vmseq.vi v0, v8, 0 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vmerge.vxm v8, v12, a1, v0 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v10, v8, v10 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v10 +; CHECK-F-NEXT: vsrl.vi v10, v12, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v12, v10, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv2i64: ; CHECK-D: # %bb.0: @@ -1503,42 +1465,23 @@ define @cttz_nxv4i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_nxv4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32F-NEXT: vrsub.vi v12, v8, 0 -; RV32F-NEXT: vand.vv v12, v8, v12 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v16, v12 -; RV32F-NEXT: vsrl.vi v12, v16, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32F-NEXT: vzext.vf2 v16, v12 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v12, v16, a1 -; RV32F-NEXT: vmseq.vi v0, v8, 0 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v12, a1, v0 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_nxv4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64F-NEXT: vrsub.vi v12, v8, 0 -; RV64F-NEXT: vand.vv v12, v8, v12 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v16, v12 -; RV64F-NEXT: vsrl.vi v12, v16, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v16, v12, a1 -; RV64F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64F-NEXT: vmseq.vi v0, v8, 0 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vmerge.vxm v8, v16, a1, v0 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_nxv4i64: +; 
CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v12, v8, v12 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v12 +; CHECK-F-NEXT: vsrl.vi v12, v16, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v16, v12, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv4i64: ; CHECK-D: # %bb.0: @@ -1648,42 +1591,23 @@ define @cttz_nxv8i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_nxv8i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32F-NEXT: vrsub.vi v16, v8, 0 -; RV32F-NEXT: vand.vv v16, v8, v16 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v24, v16 -; RV32F-NEXT: vsrl.vi v16, v24, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32F-NEXT: vzext.vf2 v24, v16 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v16, v24, a1 -; RV32F-NEXT: vmseq.vi v0, v8, 0 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v16, a1, v0 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_nxv8i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64F-NEXT: vrsub.vi v16, v8, 0 -; RV64F-NEXT: vand.vv v16, v8, v16 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v24, v16 -; RV64F-NEXT: vsrl.vi v16, v24, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v24, v16, a1 -; RV64F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64F-NEXT: vmseq.vi v0, v8, 0 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vmerge.vxm v8, v24, a1, v0 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vrsub.vi v16, v8, 0 +; CHECK-F-NEXT: vand.vv v16, v8, v16 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16 +; CHECK-F-NEXT: vsrl.vi v16, v24, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v24, v16, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v24, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv8i64: ; CHECK-D: # %bb.0: @@ -2819,35 +2743,19 @@ define @cttz_zero_undef_nxv1i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_zero_undef_nxv1i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32F-NEXT: vrsub.vi v9, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v9 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v9, v8 -; RV32F-NEXT: vsrl.vi v8, v9, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vzext.vf2 v9, v8 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v9, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_zero_undef_nxv1i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64F-NEXT: vrsub.vi v9, v8, 0 -; RV64F-NEXT: vand.vv v8, v8, v9 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v9, v8 -; RV64F-NEXT: vsrl.vi v9, v9, 23 
-; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v8, v9, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_zero_undef_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv1i64: ; CHECK-D: # %bb.0: @@ -2953,35 +2861,19 @@ define @cttz_zero_undef_nxv2i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_zero_undef_nxv2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32F-NEXT: vrsub.vi v10, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v10 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: vsrl.vi v8, v10, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vzext.vf2 v10, v8 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v10, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_zero_undef_nxv2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64F-NEXT: vrsub.vi v10, v8, 0 -; RV64F-NEXT: vand.vv v8, v8, v10 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v10, v8 -; RV64F-NEXT: vsrl.vi v10, v10, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v8, v10, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_zero_undef_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v10 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v10, v10, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v10, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv2i64: ; CHECK-D: # %bb.0: @@ -3087,35 +2979,19 @@ define @cttz_zero_undef_nxv4i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_zero_undef_nxv4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32F-NEXT: vrsub.vi v12, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v12 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v8 -; RV32F-NEXT: vsrl.vi v8, v12, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32F-NEXT: vzext.vf2 v12, v8 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v12, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_zero_undef_nxv4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64F-NEXT: vrsub.vi v12, v8, 0 -; RV64F-NEXT: vand.vv v8, v8, v12 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v12, v8 -; RV64F-NEXT: vsrl.vi v12, v12, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v8, v12, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_zero_undef_nxv4i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v12 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, 
ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-F-NEXT: vsrl.vi v12, v12, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v12, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv4i64: ; CHECK-D: # %bb.0: @@ -3221,35 +3097,19 @@ define @cttz_zero_undef_nxv8i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_zero_undef_nxv8i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32F-NEXT: vrsub.vi v16, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v16 -; RV32F-NEXT: fsrmi a0, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v16, v8 -; RV32F-NEXT: vsrl.vi v8, v16, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32F-NEXT: vzext.vf2 v16, v8 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v16, a1 -; RV32F-NEXT: fsrm a0 -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_zero_undef_nxv8i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64F-NEXT: vrsub.vi v16, v8, 0 -; RV64F-NEXT: vand.vv v8, v8, v16 -; RV64F-NEXT: fsrmi a0, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v16, v8 -; RV64F-NEXT: vsrl.vi v16, v16, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v8, v16, a1 -; RV64F-NEXT: fsrm a0 -; RV64F-NEXT: ret +; CHECK-F-LABEL: cttz_zero_undef_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vrsub.vi v16, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v16 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 +; CHECK-F-NEXT: vsrl.vi v16, v16, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v16, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv8i64: ; CHECK-D: # %bb.0: @@ -3276,5 +3136,7 @@ define @cttz_zero_undef_nxv8i64( %va) { ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; RV32: {{.*}} ; RV32D: {{.*}} +; RV32F: {{.*}} ; RV64: {{.*}} ; RV64D: {{.*}} +; RV64F: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index 49e5a1c79c43b..228a9f0d6d522 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -347,40 +347,22 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_v2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: li a1, 190 -; RV32F-NEXT: vmv.v.x v9, a1 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v8, v10, 23 -; RV32F-NEXT: vwsubu.wv v9, v9, v8 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vminu.vx v8, v9, a1 -; RV32F-NEXT: vse64.v v8, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_v2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: li a1, 190 -; RV64F-NEXT: vmv.v.x v9, a1 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vfncvt.f.xu.w v10, v8 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v8, v10, 23 -; RV64F-NEXT: vwsubu.vv v10, v9, v8 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64F-NEXT: vminu.vx v8, v10, a1 -; RV64F-NEXT: vse64.v v8, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: ctlz_v2i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vmv.v.x v9, a1 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vfncvt.f.xu.w v10, v8 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v8, v10, 23 +; RVF-NEXT: vwsubu.vv v10, v9, v8 +; RVF-NEXT: li a1, 64 +; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RVF-NEXT: vminu.vx v8, v10, a1 +; RVF-NEXT: vse64.v v8, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: ctlz_v2i64: ; RVD: # %bb.0: @@ -756,40 +738,22 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_v4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: li a1, 190 -; RV32F-NEXT: vmv.v.x v10, a1 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v8 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v8, v12, 23 -; RV32F-NEXT: vwsubu.wv v10, v10, v8 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vminu.vx v8, v10, a1 -; RV32F-NEXT: vse64.v v8, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_v4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: li a1, 190 -; RV64F-NEXT: vmv.v.x v10, a1 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vfncvt.f.xu.w v11, v8 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v8, v11, 23 -; RV64F-NEXT: vwsubu.vv v12, v10, v8 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64F-NEXT: vminu.vx v8, v12, a1 -; RV64F-NEXT: vse64.v v8, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: ctlz_v4i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vmv.v.x v10, a1 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vfncvt.f.xu.w v11, v8 +; RVF-NEXT: fsrm a1 
+; RVF-NEXT: vsrl.vi v8, v11, 23 +; RVF-NEXT: vwsubu.vv v12, v10, v8 +; RVF-NEXT: li a1, 64 +; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vminu.vx v8, v12, a1 +; RVF-NEXT: vse64.v v8, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: ctlz_v4i64: ; RVD: # %bb.0: @@ -1146,34 +1110,19 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_zero_undef_v2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: li a1, 190 -; RV32F-NEXT: vmv.v.x v9, a1 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v8, v10, 23 -; RV32F-NEXT: vwsubu.wv v9, v9, v8 -; RV32F-NEXT: vse64.v v9, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_zero_undef_v2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: li a1, 190 -; RV64F-NEXT: vmv.v.x v9, a1 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vfncvt.f.xu.w v10, v8 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v8, v10, 23 -; RV64F-NEXT: vwsubu.vv v10, v9, v8 -; RV64F-NEXT: vse64.v v10, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: ctlz_zero_undef_v2i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vmv.v.x v9, a1 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vfncvt.f.xu.w v10, v8 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v8, v10, 23 +; RVF-NEXT: vwsubu.vv v10, v9, v8 +; RVF-NEXT: vse64.v v10, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: ctlz_zero_undef_v2i64: ; RVD: # %bb.0: @@ -1531,34 +1480,19 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: ctlz_zero_undef_v4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: li a1, 190 -; RV32F-NEXT: vmv.v.x v10, a1 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v8 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v8, v12, 23 -; RV32F-NEXT: vwsubu.wv v10, v10, v8 -; RV32F-NEXT: vse64.v v10, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: ctlz_zero_undef_v4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: li a1, 190 -; RV64F-NEXT: vmv.v.x v10, a1 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vfncvt.f.xu.w v11, v8 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v8, v11, 23 -; RV64F-NEXT: vwsubu.vv v12, v10, v8 -; RV64F-NEXT: vse64.v v12, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: ctlz_zero_undef_v4i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vmv.v.x v10, a1 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vfncvt.f.xu.w v11, v8 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v8, v11, 23 +; RVF-NEXT: vwsubu.vv v12, v10, v8 +; RVF-NEXT: vse64.v v12, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: ctlz_zero_undef_v4i64: ; RVD: # %bb.0: @@ -1589,4 +1523,6 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; RV32D: {{.*}} +; RV32F: {{.*}} ; RV64D: {{.*}} +; RV64F: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index ea3a78ae0becc..4b1691aada5be 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -330,46 +330,25 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_v2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: vrsub.vi v9, v8, 0 -; RV32F-NEXT: vand.vv v9, v8, v9 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v9 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v9, v10, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vzext.vf2 v10, v9 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v9, v10, a1 -; RV32F-NEXT: vmseq.vi v0, v8, 0 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 -; RV32F-NEXT: vse64.v v8, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_v2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: vrsub.vi v9, v8, 0 -; RV64F-NEXT: vand.vv v9, v8, v9 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v10, v9 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v9, v10, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v10, v9, a1 -; RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64F-NEXT: vmseq.vi v0, v8, 0 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vmerge.vxm v8, v10, a1, v0 -; RV64F-NEXT: vse64.v v8, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: cttz_v2i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: vrsub.vi v9, v8, 0 +; RVF-NEXT: vand.vv v9, v8, v9 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RVF-NEXT: vfncvt.f.xu.w v10, v9 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v9, v10, 23 +; RVF-NEXT: li a1, 127 +; RVF-NEXT: vwsubu.vx v10, v9, a1 +; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: li a1, 64 +; RVF-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVF-NEXT: vse64.v v8, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: cttz_v2i64: ; RVD: # %bb.0: @@ -731,46 +710,25 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_v4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: vrsub.vi v10, v8, 0 -; RV32F-NEXT: vand.vv v10, v8, v10 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v12, v10 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v10, v12, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vzext.vf2 v12, v10 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v10, v12, a1 -; RV32F-NEXT: vmseq.vi v0, v8, 0 -; RV32F-NEXT: li a1, 64 -; RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 -; RV32F-NEXT: vse64.v v8, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_v4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: vrsub.vi v10, v8, 0 -; RV64F-NEXT: vand.vv v10, v8, v10 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v12, v10 
-; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v10, v12, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v12, v10, a1 -; RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64F-NEXT: vmseq.vi v0, v8, 0 -; RV64F-NEXT: li a1, 64 -; RV64F-NEXT: vmerge.vxm v8, v12, a1, v0 -; RV64F-NEXT: vse64.v v8, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: cttz_v4i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: vrsub.vi v10, v8, 0 +; RVF-NEXT: vand.vv v10, v8, v10 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RVF-NEXT: vfncvt.f.xu.w v12, v10 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v10, v12, 23 +; RVF-NEXT: li a1, 127 +; RVF-NEXT: vwsubu.vx v12, v10, a1 +; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: li a1, 64 +; RVF-NEXT: vmerge.vxm v8, v12, a1, v0 +; RVF-NEXT: vse64.v v8, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: cttz_v4i64: ; RVD: # %bb.0: @@ -1109,39 +1067,21 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_zero_undef_v2i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: vrsub.vi v9, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v9 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v9, v8 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v8, v9, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32F-NEXT: vzext.vf2 v9, v8 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v9, a1 -; RV32F-NEXT: vse64.v v8, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_zero_undef_v2i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; RV64F-NEXT: vrsub.vi v9, v8, 0 -; RV64F-NEXT: vand.vv v8, v8, v9 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v9, v8 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v8, v9, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v9, v8, a1 -; RV64F-NEXT: vse64.v v9, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: cttz_zero_undef_v2i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: vrsub.vi v9, v8, 0 +; RVF-NEXT: vand.vv v8, v8, v9 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RVF-NEXT: vfncvt.f.xu.w v9, v8 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v8, v9, 23 +; RVF-NEXT: li a1, 127 +; RVF-NEXT: vwsubu.vx v9, v8, a1 +; RVF-NEXT: vse64.v v9, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: cttz_zero_undef_v2i64: ; RVD: # %bb.0: @@ -1480,39 +1420,21 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RV32F-LABEL: cttz_zero_undef_v4i64: -; RV32F: # %bb.0: -; RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32F-NEXT: vle64.v v8, (a0) -; RV32F-NEXT: vrsub.vi v10, v8, 0 -; RV32F-NEXT: vand.vv v8, v8, v10 -; RV32F-NEXT: fsrmi a1, 1 -; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32F-NEXT: vfncvt.f.xu.w v10, v8 -; RV32F-NEXT: fsrm a1 -; RV32F-NEXT: vsrl.vi v8, v10, 23 -; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32F-NEXT: vzext.vf2 v10, v8 -; RV32F-NEXT: li a1, 127 -; RV32F-NEXT: vsub.vx v8, v10, a1 -; RV32F-NEXT: vse64.v v8, (a0) -; RV32F-NEXT: ret -; -; RV64F-LABEL: cttz_zero_undef_v4i64: -; RV64F: # %bb.0: -; RV64F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64F-NEXT: vle64.v v8, (a0) -; 
RV64F-NEXT: vrsub.vi v10, v8, 0 -; RV64F-NEXT: vand.vv v8, v8, v10 -; RV64F-NEXT: fsrmi a1, 1 -; RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64F-NEXT: vfncvt.f.xu.w v10, v8 -; RV64F-NEXT: fsrm a1 -; RV64F-NEXT: vsrl.vi v8, v10, 23 -; RV64F-NEXT: li a1, 127 -; RV64F-NEXT: vwsubu.vx v10, v8, a1 -; RV64F-NEXT: vse64.v v10, (a0) -; RV64F-NEXT: ret +; RVF-LABEL: cttz_zero_undef_v4i64: +; RVF: # %bb.0: +; RVF-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: vrsub.vi v10, v8, 0 +; RVF-NEXT: vand.vv v8, v8, v10 +; RVF-NEXT: fsrmi a1, 1 +; RVF-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RVF-NEXT: vfncvt.f.xu.w v10, v8 +; RVF-NEXT: fsrm a1 +; RVF-NEXT: vsrl.vi v8, v10, 23 +; RVF-NEXT: li a1, 127 +; RVF-NEXT: vwsubu.vx v10, v8, a1 +; RVF-NEXT: vse64.v v10, (a0) +; RVF-NEXT: ret ; ; RVD-LABEL: cttz_zero_undef_v4i64: ; RVD: # %bb.0: @@ -1545,4 +1467,6 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32D: {{.*}} +; RV32F: {{.*}} ; RV64D: {{.*}} +; RV64F: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 4c84cf350c40e..17a63eff26ac1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -479,16 +479,18 @@ define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; RV32-LABEL: shuffle_v8i16_as_i64_16: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 48 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v10, v9, a0 -; RV32-NEXT: vsll.vv v10, v8, v10 -; RV32-NEXT: vrsub.vi v9, v9, 0 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsrl.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vwsubu.vx v10, v9, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v10, a1 +; RV32-NEXT: vsrl.vv v9, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_16: @@ -523,16 +525,18 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; RV32-LABEL: shuffle_v8i16_as_i64_32: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v10, v9, a0 -; RV32-NEXT: vsll.vv v10, v8, v10 -; RV32-NEXT: vrsub.vi v9, v9, 0 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsrl.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vwsubu.vx v10, v9, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v10, a1 +; RV32-NEXT: vsrl.vv v9, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_32: @@ -567,16 +571,18 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; RV32-LABEL: shuffle_v8i16_as_i64_48: ; RV32: # %bb.0: +; 
RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v10, v9, a0 -; RV32-NEXT: vsll.vv v10, v8, v10 -; RV32-NEXT: vrsub.vi v9, v9, 0 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsrl.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vwsubu.vx v10, v9, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v10, a1 +; RV32-NEXT: vsrl.vv v9, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_48: @@ -611,16 +617,18 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; RV32-LABEL: shuffle_v8i32_as_i64: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v12, v10, a0 -; RV32-NEXT: vsll.vv v12, v8, v12 -; RV32-NEXT: vrsub.vi v10, v10, 0 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsrl.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vwsubu.vx v12, v10, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vx v10, v12, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vsll.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i32_as_i64: @@ -680,16 +688,18 @@ define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; RV32-LABEL: shuffle_v8f16_as_i64_16: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 48 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v10, v9, a0 -; RV32-NEXT: vsll.vv v10, v8, v10 -; RV32-NEXT: vrsub.vi v9, v9, 0 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsrl.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vwsubu.vx v10, v9, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v10, a1 +; RV32-NEXT: vsrl.vv v9, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_16: @@ -724,16 +734,18 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; RV32-LABEL: shuffle_v8f16_as_i64_32: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v10, v9, a0 -; RV32-NEXT: vsll.vv v10, v8, v10 -; RV32-NEXT: vrsub.vi v9, v9, 0 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsrl.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vwsubu.vx v10, v9, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v10, a1 +; RV32-NEXT: vsrl.vv v9, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: 
vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_32: @@ -768,16 +780,18 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; RV32-LABEL: shuffle_v8f16_as_i64_48: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v10, v9, a0 -; RV32-NEXT: vsll.vv v10, v8, v10 -; RV32-NEXT: vrsub.vi v9, v9, 0 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsrl.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vwsubu.vx v10, v9, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v10, a1 +; RV32-NEXT: vsrl.vv v9, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_48: @@ -812,16 +826,18 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; RV32-LABEL: shuffle_v8f32_as_i64: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v12, v10, a0 -; RV32-NEXT: vsll.vv v12, v8, v12 -; RV32-NEXT: vrsub.vi v10, v10, 0 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsrl.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vwsubu.vx v12, v10, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vx v10, v12, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vsll.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f32_as_i64: @@ -857,16 +873,18 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2) { ; RV32-LABEL: shuffle_v8f32_as_i64_exact: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vand.vx v12, v10, a0 -; RV32-NEXT: vsll.vv v12, v8, v12 -; RV32-NEXT: vrsub.vi v10, v10, 0 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsrl.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vwsubu.vx v12, v10, a0 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vx v10, v12, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vsll.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f32_as_i64_exact: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll index e719c6f374973..418b159c8fb98 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 
-mattr=+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB @@ -946,18 +946,34 @@ define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) { } define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) { -; CHECK-LABEL: vrol_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vrol_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vwsub.vx v11, v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v10, v11, a0 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vrol_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: li a0, 63 +; RV64-NEXT: vand.vx v10, v9, a0 +; RV64-NEXT: vsll.vv v10, v8, v10 +; RV64-NEXT: vrsub.vi v9, v9, 0 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsrl.vv v8, v8, v9 +; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vx_v2i64: ; CHECK-ZVKB: # %bb.0: @@ -995,18 +1011,34 @@ define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) { -; CHECK-LABEL: vrol_vx_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsll.vv v12, v8, v12 -; CHECK-NEXT: vrsub.vi v10, v10, 0 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v12, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vrol_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vwsub.vx v14, v12, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vx v12, v14, a0 +; RV32-NEXT: vsrl.vv v12, v8, v12 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: vrol_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: li a0, 63 +; RV64-NEXT: vand.vx v12, v10, a0 +; RV64-NEXT: vsll.vv v12, v8, v12 +; RV64-NEXT: vrsub.vi v10, v10, 0 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vx_v4i64: ; CHECK-ZVKB: # %bb.0: @@ -1044,18 +1076,34 @@ define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @vrol_vx_v8i64(<8 x 
i64> %a, i64 %b) { -; CHECK-LABEL: vrol_vx_v8i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsll.vv v16, v8, v16 -; CHECK-NEXT: vrsub.vi v12, v12, 0 -; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v16, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vrol_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vwsub.vx v20, v16, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vand.vx v16, v20, a0 +; RV32-NEXT: vsrl.vv v16, v8, v16 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsll.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: vrol_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: li a0, 63 +; RV64-NEXT: vand.vx v16, v12, a0 +; RV64-NEXT: vsll.vv v16, v8, v16 +; RV64-NEXT: vrsub.vi v12, v12, 0 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsrl.vv v8, v8, v12 +; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vx_v8i64: ; CHECK-ZVKB: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll index 367c56caf813d..e4ddfeb4c4195 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll @@ -1692,18 +1692,34 @@ define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) { } define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { -; CHECK-LABEL: vror_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: ret +; CHECK-RV32-LABEL: vror_vx_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.x v9, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v10, 0 +; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v11, a0 +; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: vror_vx_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.v.x v9, a0 +; CHECK-RV64-NEXT: li a0, 63 +; CHECK-RV64-NEXT: vand.vx v10, v9, a0 +; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 +; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV64-NEXT: vand.vx v9, v9, a0 +; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 +; CHECK-RV64-NEXT: vor.vv v8, v10, v8 +; CHECK-RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vx_v2i64: ; CHECK-ZVKB: # %bb.0: @@ -1719,11 +1735,13 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v2i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v9, 1 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 +; 
CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v9, 0 +; CHECK-RV32-NEXT: li a0, 1 +; CHECK-RV32-NEXT: vwsubu.vx v10, v9, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v9, v10, a0 ; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 ; CHECK-RV32-NEXT: vmv.v.x v10, a0 ; CHECK-RV32-NEXT: vand.vi v10, v10, 1 @@ -1752,11 +1770,13 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) { define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v2i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v9, 1 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v9, 0 +; CHECK-RV32-NEXT: li a0, 1 +; CHECK-RV32-NEXT: vwsubu.vx v10, v9, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v9, v10, a0 ; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 ; CHECK-RV32-NEXT: vmv.v.x v10, a0 ; CHECK-RV32-NEXT: vand.vi v10, v10, 1 @@ -1808,18 +1828,34 @@ define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { -; CHECK-LABEL: vror_vx_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsrl.vv v12, v8, v12 -; CHECK-NEXT: vrsub.vi v10, v10, 0 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsll.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v12, v8 -; CHECK-NEXT: ret +; CHECK-RV32-LABEL: vror_vx_v4i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v12, 0 +; CHECK-RV32-NEXT: vwsub.vx v14, v12, a0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v14, a0 +; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: vror_vx_v4i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-RV64-NEXT: vmv.v.x v10, a0 +; CHECK-RV64-NEXT: li a0, 63 +; CHECK-RV64-NEXT: vand.vx v12, v10, a0 +; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 +; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV64-NEXT: vand.vx v10, v10, a0 +; CHECK-RV64-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV64-NEXT: vor.vv v8, v12, v8 +; CHECK-RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vx_v4i64: ; CHECK-ZVKB: # %bb.0: @@ -1835,11 +1871,13 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v4i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 1 -; CHECK-RV32-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v10, 0 +; CHECK-RV32-NEXT: li a0, 1 +; CHECK-RV32-NEXT: vwsubu.vx v12, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v12, a0 ; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 ; CHECK-RV32-NEXT: vmv.v.x v12, a0 ; 
CHECK-RV32-NEXT: vand.vi v12, v12, 1 @@ -1868,11 +1906,13 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) { define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v4i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 1 -; CHECK-RV32-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v10, 0 +; CHECK-RV32-NEXT: li a0, 1 +; CHECK-RV32-NEXT: vwsubu.vx v12, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v12, a0 ; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV32-NEXT: vmv.v.x v12, a0 ; CHECK-RV32-NEXT: vand.vi v12, v12, 1 @@ -1924,18 +1964,34 @@ define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { -; CHECK-LABEL: vror_vx_v8i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsrl.vv v16, v8, v16 -; CHECK-NEXT: vrsub.vi v12, v12, 0 -; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsll.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v16, v8 -; CHECK-NEXT: ret +; CHECK-RV32-LABEL: vror_vx_v8i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v16, 0 +; CHECK-RV32-NEXT: vwsub.vx v20, v16, a0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v16, v20, a0 +; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: vror_vx_v8i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-RV64-NEXT: vmv.v.x v12, a0 +; CHECK-RV64-NEXT: li a0, 63 +; CHECK-RV64-NEXT: vand.vx v16, v12, a0 +; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 +; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV64-NEXT: vand.vx v12, v12, a0 +; CHECK-RV64-NEXT: vsll.vv v8, v8, v12 +; CHECK-RV64-NEXT: vor.vv v8, v16, v8 +; CHECK-RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vx_v8i64: ; CHECK-ZVKB: # %bb.0: @@ -1951,11 +2007,13 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v8i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 1 -; CHECK-RV32-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v12, 0 +; CHECK-RV32-NEXT: li a0, 1 +; CHECK-RV32-NEXT: vwsubu.vx v16, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v16, a0 ; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 ; CHECK-RV32-NEXT: vmv.v.x v16, a0 ; CHECK-RV32-NEXT: vand.vi v16, v16, 1 @@ -1984,11 +2042,13 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) { define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v8i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 1 -; CHECK-RV32-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV32-NEXT: vsetivli zero, 8, e32, 
m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v12, 0 +; CHECK-RV32-NEXT: li a0, 1 +; CHECK-RV32-NEXT: vwsubu.vx v16, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v16, a0 ; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 ; CHECK-RV32-NEXT: vmv.v.x v16, a0 ; CHECK-RV32-NEXT: vand.vi v16, v16, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index 2a31ff5ab3f8c..3a222e95566a4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -767,23 +767,13 @@ define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { } define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { -; RV32-LABEL: vwadd_vx_v2i64_i8: -; RV32: # %bb.0: -; RV32-NEXT: lb a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vwadd.wv v8, v8, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: vwadd_vx_v2i64_i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lb a0, 0(a1) -; RV64-NEXT: vwadd.vx v8, v9, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vwadd_vx_v2i64_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y %c = sext i8 %b to i64 @@ -795,23 +785,13 @@ define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { } define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { -; RV32-LABEL: vwadd_vx_v2i64_i16: -; RV32: # %bb.0: -; RV32-NEXT: lh a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vwadd.wv v8, v8, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: vwadd_vx_v2i64_i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lh a0, 0(a1) -; RV64-NEXT: vwadd.vx v8, v9, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vwadd_vx_v2i64_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y %c = sext i16 %b to i64 @@ -823,23 +803,13 @@ define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { } define <2 x i64> @vwadd_vx_v2i64_i32(ptr %x, ptr %y) nounwind { -; RV32-LABEL: vwadd_vx_v2i64_i32: -; RV32: # %bb.0: -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vwadd.wv v8, v8, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: vwadd_vx_v2i64_i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lw a0, 0(a1) -; RV64-NEXT: vwadd.vx v8, v9, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vwadd_vx_v2i64_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y %c = sext i32 %b to i64 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 93927e10e607e..9d63b8f31a3e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -795,22 +795,13 @@ define <4 x i32> @vwmul_vx_v4i32_i32(ptr %x, ptr %y) { } define <2 x i64> @vwmul_vx_v2i64_i8(ptr %x, ptr %y) { -; RV32-LABEL: vwmul_vx_v2i64_i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: lb a0, 0(a1) -; RV32-NEXT: vsext.vf2 v9, v8 -; RV32-NEXT: vmul.vx v8, v9, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: vwmul_vx_v2i64_i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lb a0, 0(a1) -; RV64-NEXT: vwmul.vx v8, v9, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vwmul_vx_v2i64_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y %c = sext i8 %b to i64 @@ -822,22 +813,13 @@ define <2 x i64> @vwmul_vx_v2i64_i8(ptr %x, ptr %y) { } define <2 x i64> @vwmul_vx_v2i64_i16(ptr %x, ptr %y) { -; RV32-LABEL: vwmul_vx_v2i64_i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: lh a0, 0(a1) -; RV32-NEXT: vsext.vf2 v9, v8 -; RV32-NEXT: vmul.vx v8, v9, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: vwmul_vx_v2i64_i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lh a0, 0(a1) -; RV64-NEXT: vwmul.vx v8, v9, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vwmul_vx_v2i64_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y %c = sext i16 %b to i64 @@ -849,22 +831,13 @@ define <2 x i64> @vwmul_vx_v2i64_i16(ptr %x, ptr %y) { } define <2 x i64> @vwmul_vx_v2i64_i32(ptr %x, ptr %y) { -; RV32-LABEL: vwmul_vx_v2i64_i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: lw a0, 0(a1) -; RV32-NEXT: vsext.vf2 v9, v8 -; RV32-NEXT: vmul.vx v8, v9, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: vwmul_vx_v2i64_i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lw a0, 0(a1) -; RV64-NEXT: vwmul.vx v8, v9, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vwmul_vx_v2i64_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y %c = sext i32 %b to i64 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll index af67b9920ed1e..fce22849a58af 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll @@ -906,19 +906,12 @@ define <4 x i64> @vwsll_vi_v4i64_v4i8(<4 x i8> %a) { ; CHECK-NEXT: vsll.vi v8, v10, 2 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-RV32-LABEL: vwsll_vi_v4i64_v4i8: -; CHECK-ZVBB-RV32: # %bb.0: -; CHECK-ZVBB-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-ZVBB-RV32-NEXT: vzext.vf8 v10, v8 -; CHECK-ZVBB-RV32-NEXT: vsll.vi v8, v10, 2 -; CHECK-ZVBB-RV32-NEXT: 
ret -; -; CHECK-ZVBB-RV64-LABEL: vwsll_vi_v4i64_v4i8: -; CHECK-ZVBB-RV64: # %bb.0: -; CHECK-ZVBB-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-RV64-NEXT: vzext.vf4 v10, v8 -; CHECK-ZVBB-RV64-NEXT: vwsll.vi v8, v10, 2 -; CHECK-ZVBB-RV64-NEXT: ret +; CHECK-ZVBB-LABEL: vwsll_vi_v4i64_v4i8: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 +; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: ret %x = zext <4 x i8> %a to <4 x i64> %z = shl <4 x i64> %x, splat (i64 2) ret <4 x i64> %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index c3353a2df4912..d632dc4c2a30d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -770,24 +770,14 @@ define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { } define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { -; RV32-LABEL: vwsub_vx_v2i64_i8: -; RV32: # %bb.0: -; RV32-NEXT: lb a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vwsub.wv v8, v8, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: vwsub_vx_v2i64_i8: -; RV64: # %bb.0: -; RV64-NEXT: lb a1, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vwsub.vv v8, v10, v9 -; RV64-NEXT: ret +; CHECK-LABEL: vwsub_vx_v2i64_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: lb a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y %c = sext i8 %b to i64 @@ -799,24 +789,14 @@ define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { } define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { -; RV32-LABEL: vwsub_vx_v2i64_i16: -; RV32: # %bb.0: -; RV32-NEXT: lh a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vwsub.wv v8, v8, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: vwsub_vx_v2i64_i16: -; RV64: # %bb.0: -; RV64-NEXT: lh a1, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vwsub.vv v8, v10, v9 -; RV64-NEXT: ret +; CHECK-LABEL: vwsub_vx_v2i64_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lh a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y %c = sext i16 %b to i64 @@ -828,24 +808,14 @@ define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { } define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind { -; RV32-LABEL: vwsub_vx_v2i64_i32: -; RV32: # %bb.0: -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vwsub.wv v8, v8, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: vwsub_vx_v2i64_i32: -; RV64: # %bb.0: -; RV64-NEXT: lw a1, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vwsub.vv v8, v10, v9 -; RV64-NEXT: ret +; 
CHECK-LABEL: vwsub_vx_v2i64_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y %c = sext i32 %b to i64 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll index 4ea5a6709db5c..4a86b717f9f3c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll @@ -944,13 +944,16 @@ define @vrol_vx_nxv1i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v9, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v10, 0 +; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v10, v9, a0 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v11, a0 +; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vrol_vx_nxv1i64: @@ -1004,13 +1007,16 @@ define @vrol_vx_nxv2i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v12, 0 +; CHECK-RV32-NEXT: vwsub.vx v14, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v12, v10, a0 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v14, a0 +; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 ; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vrol_vx_nxv2i64: @@ -1064,13 +1070,16 @@ define @vrol_vx_nxv4i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v16, 0 +; CHECK-RV32-NEXT: vwsub.vx v20, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v16, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV32-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v16, v20, a0 +; CHECK-RV32-NEXT: vsrl.vv v16, v8, v16 ; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v16, v8 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vrol_vx_nxv4i64: @@ -1124,13 +1133,16 @@ define @vrol_vx_nxv8i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v16, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v24, 0 +; CHECK-RV32-NEXT: vwsub.vx v0, v24, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v24, v16, a0 -; CHECK-RV32-NEXT: vsll.vv v24, v8, v24 -; 
CHECK-RV32-NEXT: vrsub.vi v16, v16, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-RV32-NEXT: vand.vx v24, v0, a0 +; CHECK-RV32-NEXT: vsrl.vv v24, v8, v24 ; CHECK-RV32-NEXT: vand.vx v16, v16, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v24, v8 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v8, v24 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vrol_vx_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll index 16abf2bd28acc..cf2f0d8873165 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll @@ -1611,13 +1611,16 @@ define @vror_vx_nxv1i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v9, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v10, 0 +; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v10, v9, a0 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v11, a0 +; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 ; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_nxv1i64: @@ -1710,13 +1713,16 @@ define @vror_vx_nxv2i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v12, 0 +; CHECK-RV32-NEXT: vwsub.vx v14, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v12, v10, a0 -; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v14, a0 +; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 ; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_nxv2i64: @@ -1809,13 +1815,16 @@ define @vror_vx_nxv4i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v16, 0 +; CHECK-RV32-NEXT: vwsub.vx v20, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v16, v12, a0 -; CHECK-RV32-NEXT: vsrl.vv v16, v8, v16 -; CHECK-RV32-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v16, v20, a0 +; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 ; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v16, v8 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_nxv4i64: @@ -1908,13 +1917,16 @@ define @vror_vx_nxv8i64( %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v16, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v24, 0 +; CHECK-RV32-NEXT: vwsub.vx v0, v24, a0 ; 
CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v24, v16, a0 -; CHECK-RV32-NEXT: vsrl.vv v24, v8, v24 -; CHECK-RV32-NEXT: vrsub.vi v16, v16, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-RV32-NEXT: vand.vx v24, v0, a0 +; CHECK-RV32-NEXT: vsll.vv v24, v8, v24 ; CHECK-RV32-NEXT: vand.vx v16, v16, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v24, v8 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v8, v24 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index d70f619c3601a..06b31657e0eca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1551,21 +1551,12 @@ define @vwadd_wx_splat_zext( %va, i32 %b) { } define @vwadd_vx_splat_sext( %va, i32 %b) { -; RV32-LABEL: vwadd_vx_splat_sext: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vwadd.wv v16, v16, v8 -; RV32-NEXT: vmv8r.v v8, v16 -; RV32-NEXT: ret -; -; RV64-LABEL: vwadd_vx_splat_sext: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV64-NEXT: vwadd.vx v16, v8, a0 -; RV64-NEXT: vmv8r.v v8, v16 -; RV64-NEXT: ret +; CHECK-LABEL: vwadd_vx_splat_sext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; CHECK-NEXT: vwadd.vx v16, v8, a0 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: ret %sb = sext i32 %b to i64 %head = insertelement poison, i64 %sb, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll index 41ec2fc443d02..ff807adf0e59f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll @@ -864,20 +864,16 @@ define @vwsll_vi_nxv2i64_nxv2i8( %a) { ; CHECK-NEXT: vsll.vi v8, v10, 2 ; CHECK-NEXT: ret ; -; RV32ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8: -; RV32ZVBB: # %bb.0: -; RV32ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32ZVBB-NEXT: vzext.vf8 v10, v8 -; RV32ZVBB-NEXT: vsll.vi v8, v10, 2 -; RV32ZVBB-NEXT: ret -; -; RV64ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8: -; RV64ZVBB: # %bb.0: -; RV64ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV64ZVBB-NEXT: vzext.vf4 v10, v8 -; RV64ZVBB-NEXT: vwsll.vi v8, v10, 2 -; RV64ZVBB-NEXT: ret +; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 +; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: ret %x = zext %a to %z = shl %x, splat (i64 2) ret %z } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32ZVBB: {{.*}} +; RV64ZVBB: {{.*}} From b5e4d323badbd24324bfab4366b670977b16df07 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 17 Jul 2024 14:27:31 -0500 Subject: [PATCH 329/777] [bazel][mlir] Add MathToROCDL to fix layering check (#99377) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 5badfccc29f22..fe67286422f15 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4222,6 +4222,7 @@ cc_library( ":MathToFuncs", ":MathToLLVM", ":MathToLibm", + ":MathToROCDL", ":MathToSPIRV", ":MemRefToEmitC", ":MemRefToLLVM", From 8044a863518166db1a1e05df5c76e26d53dbbcb9 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 17 Jul 2024 12:53:40 -0700 Subject: [PATCH 330/777] [compiler-rt][www] Update standalone build instruction (#98707) Follow up to #71500 --- compiler-rt/www/index.html | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/compiler-rt/www/index.html b/compiler-rt/www/index.html index ddad80c7ebff1..72f1491a396f6 100644 --- a/compiler-rt/www/index.html +++ b/compiler-rt/www/index.html @@ -116,14 +116,13 @@

    Get it and get involved!

    cmake.

    To build it separately, first
-   build LLVM
-   separately to get llvm-config binary, and then run:
+   build LLVM, and then run:

    • cd llvm-project
    • mkdir build-compiler-rt
    • cd build-compiler-rt
-   • cmake ../compiler-rt -DLLVM_CMAKE_DIR=/path/to/llvm-project/cmake/modules
+   • cmake ../compiler-rt -DLLVM_CMAKE_DIR=/path/to/llvm-build-dir
    • make
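
    A consolidated sketch of the standalone flow described above, for readers following the updated page. It is illustrative only: the Ninja generator, Release build type, and the build-llvm / build-compiler-rt directory names are assumptions made for this example and are not part of the patch; the one detail taken from the diff is that -DLLVM_CMAKE_DIR now points at an LLVM build directory instead of llvm-project/cmake/modules.

      # Build LLVM first, so an LLVM build directory exists for compiler-rt to use.
      cd llvm-project
      cmake -S llvm -B build-llvm -G Ninja -DCMAKE_BUILD_TYPE=Release
      ninja -C build-llvm
      # Configure compiler-rt against that LLVM build directory, per this change.
      mkdir build-compiler-rt
      cd build-compiler-rt
      cmake ../compiler-rt -DLLVM_CMAKE_DIR=../build-llvm
      make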
    From 495d3ea989d4e97ce77ee73d6b35b171a7346019 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 17 Jul 2024 13:06:58 -0700 Subject: [PATCH 331/777] [MachineSink][RISCV] Only call isConstantPhysReg or isIgnorableUse for uses. (#99363) The included test case contains X0 as a def register. X0 is considered a constant register when it is a use. When its a def, it means to throw away the result value. If we treat it as a constant register here, we will execute the continue and not assign `DefReg` to any register. This will cause a crash when trying to get the register class for `DefReg` after the loop. By only checking isConstantPhysReg for uses, we will reach the `return false` a little further down and stop processing this instruction. --- llvm/lib/CodeGen/MachineSink.cpp | 2 +- .../CodeGen/RISCV/sink-and-fold-crash.mir | 39 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/RISCV/sink-and-fold-crash.mir diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index d782c8b086319..4b3ff57fb478a 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -417,7 +417,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI, continue; } - if (Reg.isPhysical() && + if (Reg.isPhysical() && MO.isUse() && (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO))) continue; diff --git a/llvm/test/CodeGen/RISCV/sink-and-fold-crash.mir b/llvm/test/CodeGen/RISCV/sink-and-fold-crash.mir new file mode 100644 index 0000000000000..a14c5ceed9ec5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/sink-and-fold-crash.mir @@ -0,0 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=riscv64 -run-pass=machine-sink -o - | FileCheck %s + +--- +name: main +tracksRegLiveness: true +registers: + - { id: 0, class: fpr32 } + - { id: 1, class: fpr32 } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +liveins: + - { reg: '$f10_f', virtual-reg: '%0' } + - { reg: '$f11_f', virtual-reg: '%1' } +body: | + bb.0.entry: + liveins: $f10_f, $f11_f + + ; CHECK-LABEL: name: main + ; CHECK: liveins: $f10_f, $f11_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr32 = COPY $f11_f + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $f10_f + ; CHECK-NEXT: [[ReadFFLAGS:%[0-9]+]]:gpr = ReadFFLAGS implicit $fflags + ; CHECK-NEXT: [[FLE_S:%[0-9]+]]:gpr = nofpexcept FLE_S [[COPY1]], [[COPY]] + ; CHECK-NEXT: WriteFFLAGS killed [[ReadFFLAGS]], implicit-def $fflags + ; CHECK-NEXT: $x0 = nofpexcept FEQ_S [[COPY1]], [[COPY]] + ; CHECK-NEXT: $x10 = COPY [[FLE_S]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %1:fpr32 = COPY $f11_f + %0:fpr32 = COPY $f10_f + %3:gpr = ReadFFLAGS implicit $fflags + %2:gpr = nofpexcept FLE_S %0, %1 + WriteFFLAGS killed %3, implicit-def $fflags + $x0 = nofpexcept FEQ_S %0, %1 + $x10 = COPY %2 + PseudoRET implicit $x10 + +... From 63fae3ed656241a1d6a19c3e773ecc9bfff3e182 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 17 Jul 2024 21:11:00 +0100 Subject: [PATCH 332/777] [AMDGPU] clang-tidy: no else after return etc. NFC. 
(#99298) --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 3 +- .../AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 129 +++++++++--------- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 15 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 7 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 51 ++++--- .../Disassembler/AMDGPUDisassembler.cpp | 22 ++- llvm/lib/Target/AMDGPU/GCNILPSched.cpp | 10 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 3 +- .../MCTargetDesc/AMDGPUMCTargetDesc.cpp | 3 +- .../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 44 +++--- .../AMDGPU/R600ControlFlowFinalizer.cpp | 32 ++--- llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp | 5 +- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 28 ++-- llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 54 ++++---- .../AMDGPU/R600MachineCFGStructurizer.cpp | 2 +- .../Target/AMDGPU/R600MachineScheduler.cpp | 9 +- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 3 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 27 ++-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 28 ++-- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 60 ++++---- .../Target/AMDGPU/SIOptimizeExecMasking.cpp | 10 +- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 19 ++- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 10 +- 24 files changed, 273 insertions(+), 304 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 632657589bdd2..3154dc6fe433d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1450,7 +1450,8 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, *MF->getSubtarget().getRegisterInfo()); return false; - } else if (MO.isImm()) { + } + if (MO.isImm()) { int64_t Val = MO.getImm(); if (AMDGPU::isInlinableIntLiteral(Val)) { O << Val; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 5874a6f1f3992..07b2ecc2fed0e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -568,16 +568,14 @@ class RegionMRT : public MRT { bool contains(MachineBasicBlock *MBB) { for (auto *CI : Children) { if (CI->isMBB()) { - if (MBB == CI->getMBBMRT()->getMBB()) { + if (MBB == CI->getMBBMRT()->getMBB()) return true; - } } else { - if (CI->getRegionMRT()->contains(MBB)) { + if (CI->getRegionMRT()->contains(MBB)) return true; - } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && - CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) { + if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && + CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) return true; - } } } return false; @@ -2259,63 +2257,60 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( CodeBB->addSuccessor(MergeBB); CurrentRegion->addMBB(CodeBB); return nullptr; - } else { - // Handle internal block. - const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); - Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass); - rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); - bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; - MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, - BBSelectRegIn, IsRegionEntryBB); - CurrentRegion->addMBB(IfBB); - // If this is the entry block we need to make the If block the new - // linearized region entry. 
- if (IsRegionEntryBB) { - CurrentRegion->setEntry(IfBB); - - if (CurrentRegion->getHasLoop()) { - MachineBasicBlock *RegionExit = CurrentRegion->getExit(); - MachineBasicBlock *ETrueBB = nullptr; - MachineBasicBlock *EFalseBB = nullptr; - SmallVector ECond; - - const DebugLoc &DL = DebugLoc(); - TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); - TII->removeBranch(*RegionExit); - - // We need to create a backedge if there is a loop - Register Reg = TII->insertNE( - RegionExit, RegionExit->instr_end(), DL, - CurrentRegion->getRegionMRT()->getInnerOutputRegister(), - CurrentRegion->getRegionMRT()->getEntry()->getNumber()); - MachineOperand RegOp = - MachineOperand::CreateReg(Reg, false, false, true); - ArrayRef Cond(RegOp); - LLVM_DEBUG(dbgs() << "RegionExitReg: "); - LLVM_DEBUG(RegOp.print(dbgs(), TRI)); - LLVM_DEBUG(dbgs() << "\n"); - TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, - Cond, DebugLoc()); - RegionExit->addSuccessor(CurrentRegion->getEntry()); - } - } - CurrentRegion->addMBB(CodeBB); - LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); + } + // Handle internal block. + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); + Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); + bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, + BBSelectRegIn, IsRegionEntryBB); + CurrentRegion->addMBB(IfBB); + // If this is the entry block we need to make the If block the new + // linearized region entry. + if (IsRegionEntryBB) { + CurrentRegion->setEntry(IfBB); + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector ECond; - InnerRegion.setParent(CurrentRegion); - LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); - insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, - CodeBBSelectReg); - InnerRegion.addMBB(MergeBB); + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); - LLVM_DEBUG(InnerRegion.print(dbgs(), TRI)); - rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); - extractKilledPHIs(CodeBB); - if (IsRegionEntryBB) { - createEntryPHIs(CurrentRegion); + // We need to create a backedge if there is a loop + Register Reg = + TII->insertNE(RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef Cond(RegOp); + LLVM_DEBUG(dbgs() << "RegionExitReg: "); + LLVM_DEBUG(RegOp.print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(CurrentRegion->getEntry()); } - return IfBB; } + CurrentRegion->addMBB(CodeBB); + LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); + + InnerRegion.setParent(CurrentRegion); + LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + InnerRegion.addMBB(MergeBB); + + LLVM_DEBUG(InnerRegion.print(dbgs(), TRI)); + rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); + extractKilledPHIs(CodeBB); + if 
(IsRegionEntryBB) + createEntryPHIs(CurrentRegion); + return IfBB; } MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( @@ -2712,12 +2707,11 @@ bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) { if (false && regionIsSimpleIf(Region)) { transformSimpleIfRegion(Region); return true; - } else if (regionIsSequence(Region)) { + } + if (regionIsSequence(Region)) fixupRegionExits(Region); - return false; - } else { + else structurizeComplexRegion(Region); - } return false; } @@ -2784,12 +2778,11 @@ AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned Selec InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII); MRT->setBBSelectRegIn(InnerSelectOut); return InnerSelectOut; - } else { - MRT->setBBSelectRegOut(SelectOut); - unsigned NewSelectIn = createBBSelectReg(TII, MRI); - MRT->setBBSelectRegIn(NewSelectIn); - return NewSelectIn; } + MRT->setBBSelectRegOut(SelectOut); + unsigned NewSelectIn = createBBSelectReg(TII, MRI); + MRT->setBBSelectRegIn(NewSelectIn); + return NewSelectIn; } static void checkRegOnlyPHIInputs(MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 17413ab55536d..73796edb5d3e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1116,15 +1116,14 @@ bool AMDGPURegisterBankInfo::applyMappingLoad( LegalizerHelper::Legalized) return false; return true; + } + LLT WiderTy = widen96To128(LoadTy); + auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); + if (WiderTy.isScalar()) { + B.buildTrunc(MI.getOperand(0), WideLoad); } else { - LLT WiderTy = widen96To128(LoadTy); - auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - if (WiderTy.isScalar()) - B.buildTrunc(MI.getOperand(0), WideLoad); - else { - B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), - WideLoad); - } + B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), + WideLoad); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 55218afb9a8e8..2e1bdf4692478 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1038,15 +1038,14 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) return static_cast(MF.getSubtarget()); - else - return static_cast(MF.getSubtarget()); + return static_cast(MF.getSubtarget()); } const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) { if (TM.getTargetTriple().getArch() == Triple::amdgcn) return static_cast(TM.getSubtarget(F)); - else - return static_cast(TM.getSubtarget(F)); + return static_cast( + TM.getSubtarget(F)); } GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 1d43043308ed9..217487b2cc7e6 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -99,13 +99,11 @@ class AMDGPUOperand : public MCParsedAsmOperand { int64_t getModifiersOperand() const { assert(!(hasFPModifiers() && hasIntModifiers()) && "fp and int modifiers should not be used simultaneously"); - if (hasFPModifiers()) { + if 
(hasFPModifiers()) return getFPModifiersOperand(); - } else if (hasIntModifiers()) { + if (hasIntModifiers()) return getIntModifiersOperand(); - } else { - return 0; - } + return 0; } friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods); @@ -2162,10 +2160,9 @@ template bool AMDGPUOperand::isT16VRegWithInputMods() const { bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg32(); - else if (AsmParser->isGFX9Plus()) + if (AsmParser->isGFX9Plus()) return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type); - else - return false; + return false; } bool AMDGPUOperand::isSDWAFP16Operand() const { @@ -3680,19 +3677,17 @@ static OperandIndices getSrcOperandIndices(unsigned Opcode, bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { const MCOperand &MO = Inst.getOperand(OpIdx); - if (MO.isImm()) { + if (MO.isImm()) return !isInlineConstant(Inst, OpIdx); - } else if (MO.isReg()) { + if (MO.isReg()) { auto Reg = MO.getReg(); - if (!Reg) { + if (!Reg) return false; - } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); auto PReg = mc2PseudoReg(Reg); return isSGPR(PReg, TRI) && PReg != SGPR_NULL; - } else { - return true; } + return true; } // Based on the comment for `AMDGPUInstructionSelector::selectWritelane`: @@ -6338,16 +6333,20 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { setForcedDPP(true); setForcedEncodingSize(64); return Name.substr(0, Name.size() - 8); - } else if (Name.ends_with("_e64")) { + } + if (Name.ends_with("_e64")) { setForcedEncodingSize(64); return Name.substr(0, Name.size() - 4); - } else if (Name.ends_with("_e32")) { + } + if (Name.ends_with("_e32")) { setForcedEncodingSize(32); return Name.substr(0, Name.size() - 4); - } else if (Name.ends_with("_dpp")) { + } + if (Name.ends_with("_dpp")) { setForcedDPP(true); return Name.substr(0, Name.size() - 4); - } else if (Name.ends_with("_sdwa")) { + } + if (Name.ends_with("_sdwa")) { setForcedSDWA(true); return Name.substr(0, Name.size() - 5); } @@ -7754,10 +7753,9 @@ AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { Val = getToken().getStringContents(); lex(); return true; - } else { - Error(getLoc(), ErrMsg); - return false; } + Error(getLoc(), ErrMsg); + return false; } bool @@ -7766,11 +7764,10 @@ AMDGPUAsmParser::parseId(StringRef &Val, const StringRef ErrMsg) { Val = getTokenStr(); lex(); return true; - } else { - if (!ErrMsg.empty()) - Error(getLoc(), ErrMsg); - return false; } + if (!ErrMsg.empty()) + Error(getLoc(), ErrMsg); + return false; } AsmToken @@ -9475,8 +9472,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, (SkipSrcVcc && Inst.getNumOperands() == 5))) { SkippedVcc = true; continue; - } else if (BasicInstType == SIInstrFlags::VOPC && - Inst.getNumOperands() == 0) { + } + if (BasicInstType == SIInstrFlags::VOPC && Inst.getNumOperands() == 0) { SkippedVcc = true; continue; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 3e7b6ab19dd0c..1a0dc7098347a 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1566,8 +1566,7 @@ AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val, if (MandatoryLiteral) // Keep a sentinel value for deferred setting return MCOperand::createImm(LITERAL_CONST); - else - return decodeLiteralConstant(Sema == 
AMDGPU::OperandSemantics::FP64); + return decodeLiteralConstant(Sema == AMDGPU::OperandSemantics::FP64); } switch (Width) { @@ -1701,9 +1700,9 @@ AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, const unsigned Val, return decodeFPImmed(ImmWidth, SVal, Sema); return decodeSpecialReg32(SVal); - } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) { - return createRegOperand(getVgprClassId(Width), Val); } + if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) + return createRegOperand(getVgprClassId(Width), Val); llvm_unreachable("unsupported target"); } @@ -1731,15 +1730,13 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { if (TTmpIdx >= 0) { auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32); return createSRegOperand(TTmpClsId, TTmpIdx); - } else if (Val > SGPR_MAX) { - return IsWave64 ? decodeSpecialReg64(Val) - : decodeSpecialReg32(Val); - } else { - return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val); } - } else { - return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO); + if (Val > SGPR_MAX) { + return IsWave64 ? decodeSpecialReg64(Val) : decodeSpecialReg32(Val); + } + return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val); } + return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO); } MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const { @@ -2265,7 +2262,8 @@ Expected AMDGPUDisassembler::decodeKernelDescriptorDirective( return createReservedKDBitsError( KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, amdhsa::KERNEL_CODE_PROPERTIES_OFFSET, "must be zero on gfx9"); - } else if (isGFX10Plus()) { + } + if (isGFX10Plus()) { PRINT_DIRECTIVE(".amdhsa_wavefront_size32", KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); } diff --git a/llvm/lib/Target/AMDGPU/GCNILPSched.cpp b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp index 5926abca12449..8f15cc1b2b537 100644 --- a/llvm/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp @@ -224,13 +224,11 @@ const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right) return result > 0 ? right : left; return left; } - else { - if (left->getHeight() != right->getHeight()) - return (left->getHeight() > right->getHeight()) ? right : left; + if (left->getHeight() != right->getHeight()) + return (left->getHeight() > right->getHeight()) ? right : left; - if (left->getDepth() != right->getDepth()) - return (left->getDepth() < right->getDepth()) ? right : left; - } + if (left->getDepth() != right->getDepth()) + return (left->getDepth() < right->getDepth()) ? 
right : left; assert(left->NodeQueueId && right->NodeQueueId && "NodeQueueId cannot be zero"); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index bb5de368810d5..37bb9675d8c1d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1071,7 +1071,8 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) { O << " /* DP ALU dpp only supports row_newbcast */"; return; - } else if (Imm <= DppCtrl::QUAD_PERM_LAST) { + } + if (Imm <= DppCtrl::QUAD_PERM_LAST) { O << "quad_perm:["; O << formatDec(Imm & 0x3) << ','; O << formatDec((Imm & 0xc) >> 2) << ','; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 30dd384051b94..d2ac5a7ebb2fb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -88,8 +88,7 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, const MCRegisterInfo &MRI) { if (T.getArch() == Triple::r600) return new R600InstPrinter(MAI, MII, MRI); - else - return new AMDGPUInstPrinter(MAI, MII, MRI); + return new AMDGPUInstPrinter(MAI, MII, MRI); } static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 6c539df7677ee..fa040d548f64c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -94,7 +94,8 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, MI.getOpcode() == R600::BUNDLE || MI.getOpcode() == R600::KILL) { return; - } else if (IS_VTX(Desc)) { + } + if (IS_VTX(Desc)) { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset if (!(STI.hasFeature(R600::FeatureCaymanISA))) { @@ -105,29 +106,24 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, emit(InstWord2, CB); emit((uint32_t)0, CB); } else if (IS_TEX(Desc)) { - int64_t Sampler = MI.getOperand(14).getImm(); - - int64_t SrcSelect[4] = { - MI.getOperand(2).getImm(), - MI.getOperand(3).getImm(), - MI.getOperand(4).getImm(), - MI.getOperand(5).getImm() - }; - int64_t Offsets[3] = { - MI.getOperand(6).getImm() & 0x1F, - MI.getOperand(7).getImm() & 0x1F, - MI.getOperand(8).getImm() & 0x1F - }; - - uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); - uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | - SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | - SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | - Offsets[2] << 10; - - emit(Word01, CB); - emit(Word2, CB); - emit((uint32_t)0, CB); + int64_t Sampler = MI.getOperand(14).getImm(); + + int64_t SrcSelect[4] = { + MI.getOperand(2).getImm(), MI.getOperand(3).getImm(), + MI.getOperand(4).getImm(), MI.getOperand(5).getImm()}; + int64_t Offsets[3] = {MI.getOperand(6).getImm() & 0x1F, + MI.getOperand(7).getImm() & 0x1F, + MI.getOperand(8).getImm() & 0x1F}; + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | + Offsets[1] << 5 | 
Offsets[2] << 10; + + emit(Word01, CB); + emit(Word2, CB); + emit((uint32_t)0, CB); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); if ((STI.hasFeature(R600::FeatureR600ALUInst)) && diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 4e26bc8a4b52c..81b142e4e7b9e 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -89,15 +89,14 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { // work-around when CurrentSubEntries > 3 allows us to over-allocate stack // resources without any problems. return CurrentSubEntries > 3; - } else { - assert(ST->getWavefrontSize() == 32); - // We are being conservative here. We only require the work-around if - // CurrentSubEntries > 7 && - // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) - // See the comment on the wavefront size == 64 case for why we are - // being conservative. - return CurrentSubEntries > 7; } + assert(ST->getWavefrontSize() == 32); + // We are being conservative here. We only require the work-around if + // CurrentSubEntries > 7 && + // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) + // See the comment on the wavefront size == 64 case for why we are + // being conservative. + return CurrentSubEntries > 7; } } @@ -106,19 +105,18 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { default: return 0; case CFStack::FIRST_NON_WQM_PUSH: - assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= AMDGPUSubtarget::R700) { - // +1 For the push operation. - // +2 Extra space required. - return 3; - } else { + assert(!ST->hasCaymanISA()); + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { + // +1 For the push operation. + // +2 Extra space required. + return 3; + } // Some documentation says that this is not necessary on Evergreen, // but experimentation has show that we need to allocate 1 extra // sub-entry for the first non-WQM push. // +1 For the push operation. // +1 Extra space required. return 2; - } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); // +1 For the push operation. @@ -294,8 +292,8 @@ class R600ControlFlowFinalizer : public MachineFunctionPass { if ((DstRegs.find(SrcMI) == DstRegs.end())) { DstRegs.insert(DstMI); return true; - } else - return false; + } + return false; } ClauseFile diff --git a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp index 28bcf72b3b091..6b4f5a88c6476 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp @@ -175,8 +175,9 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, MVT::i32); return true; // If the pointer address is constant, we can move it to the offset field. 
- } else if ((IMMOffset = dyn_cast(Addr)) && - isInt<16>(IMMOffset->getZExtValue())) { + } + if ((IMMOffset = dyn_cast(Addr)) && + isInt<16>(IMMOffset->getZExtValue())) { Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(CurDAG->getEntryNode()), R600::ZERO, MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 159b2d440b31a..7e0d96622f3c5 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -775,13 +775,11 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, } bool R600TargetLowering::isZero(SDValue Op) const { - if(ConstantSDNode *Cst = dyn_cast(Op)) { + if (ConstantSDNode *Cst = dyn_cast(Op)) return Cst->isZero(); - } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ + if (ConstantFPSDNode *CstFP = dyn_cast(Op)) return CstFP->isZero(); - } else { - return false; - } + return false; } bool R600TargetLowering::isHWTrueValue(SDValue Op) const { @@ -1187,7 +1185,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); - } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { + } + if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); @@ -1348,16 +1347,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (isa(LoadNode->getMemOperand()->getValue()) || isa(Ptr)) { return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG); - } else { - //TODO: Does this even work? - // non-constant ptr can't be folded, keeps it as a v4f32 load - Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(4, DL, MVT::i32)), - DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) - ); } + // TODO: Does this even work? 
+ // non-constant ptr can't be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(4, DL, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, + DL, MVT::i32)); if (!VT.isVector()) { Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 29a43bf4dc52f..a3159944a2add 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -679,7 +679,8 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (LastOpc == R600::JUMP) { TBB = LastInst.getOperand(0).getMBB(); return false; - } else if (LastOpc == R600::JUMP_COND) { + } + if (LastOpc == R600::JUMP_COND) { auto predSet = I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -739,38 +740,36 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, if (Cond.empty()) { BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB); return 1; - } else { - MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); - assert(PredSet && "No previous predicate !"); - addFlag(*PredSet, 0, MO_FLAG_PUSH); - PredSet->getOperand(2).setImm(Cond[1].getImm()); - - BuildMI(&MBB, DL, get(R600::JUMP_COND)) - .addMBB(TBB) - .addReg(R600::PREDICATE_BIT, RegState::Kill); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - return 1; - assert (CfAlu->getOpcode() == R600::CF_ALU); - CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); - return 1; } - } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); assert(PredSet && "No previous predicate !"); addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(R600::JUMP_COND)) - .addMBB(TBB) - .addReg(R600::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB); + .addMBB(TBB) + .addReg(R600::PREDICATE_BIT, RegState::Kill); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) - return 2; + return 1; assert (CfAlu->getOpcode() == R600::CF_ALU); CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); - return 2; + return 1; } + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(*PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(R600::JUMP_COND)) + .addMBB(TBB) + .addReg(R600::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 2; + assert(CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); + return 2; } unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, @@ -853,20 +852,19 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { // be predicated. Until we have proper support for instruction clauses in the // backend, we will mark KILL* instructions as unpredicable. - if (MI.getOpcode() == R600::KILLGT) { + if (MI.getOpcode() == R600::KILLGT) return false; - } else if (MI.getOpcode() == R600::CF_ALU) { + if (MI.getOpcode() == R600::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. 
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI)) return false; // TODO: We don't support KC merging atm return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0; - } else if (isVector(MI)) { - return false; - } else { - return TargetInstrInfo::isPredicable(MI); } + if (isVector(MI)) + return false; + return TargetInstrInfo::isPredicable(MI); } bool diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index abcccc492c671..4db5808c93f50 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -598,7 +598,7 @@ MachineInstr *R600MachineCFGStructurizer::getLoopendBlockBranchInstr( if (MI) { if (isCondBranch(MI) || isUncondBranch(MI)) return MI; - else if (!TII->isMov(MI->getOpcode())) + if (!TII->isMov(MI->getOpcode())) break; } } diff --git a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index d26879ed8d608..eded8063feaaa 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -202,11 +202,9 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { bool R600SchedStrategy::regBelongsToClass(Register Reg, const TargetRegisterClass *RC) const { - if (!Reg.isVirtual()) { + if (!Reg.isVirtual()) return RC->contains(Reg); - } else { - return MRI->getRegClass(Reg) == RC; - } + return MRI->getRegClass(Reg) == RC; } R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { @@ -319,9 +317,8 @@ SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { InstructionsGroupCandidate.pop_back(); Q.erase((It + 1).base()); return SU; - } else { - InstructionsGroupCandidate.pop_back(); } + InstructionsGroupCandidate.pop_back(); } return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 3491558a3e8e7..6dfd0bb3964e9 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -456,7 +456,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { Imm = nullptr; break; - } else if (MO.isImm()) + } + if (MO.isImm()) Imm = &MO; } if (Imm) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index df5a334f83082..b68962e0541ce 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1663,14 +1663,14 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const { - if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { + if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) return (MemVT.getSizeInBits() <= 4 * 32); - } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { + if (AS == AMDGPUAS::PRIVATE_ADDRESS) { unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); return (MemVT.getSizeInBits() <= MaxPrivateBits); - } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - return (MemVT.getSizeInBits() <= 2 * 32); } + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) + return (MemVT.getSizeInBits() <= 2 * 32); return true; } @@ -3031,7 +3031,8 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(NewArg); continue; - } else if (!IsEntryFunc && VA.isMemLoc()) 
{ + } + if (!IsEntryFunc && VA.isMemLoc()) { SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); InVals.push_back(Val); if (!Arg.Flags.isByVal()) @@ -10921,7 +10922,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return expandUnalignedStore(Store, DAG); return SDValue(); - } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { + } + if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { case 4: return scalarizeVectorStore(Store, DAG); @@ -12516,11 +12518,12 @@ SITargetLowering::performSignExtendInRegCombine(SDNode *N, Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand()); SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); return LoadVal; - } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && - VTSign->getVT() == MVT::i8) || - (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && - VTSign->getVT() == MVT::i16)) && - Src.hasOneUse()) { + } + if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && + VTSign->getVT() == MVT::i8) || + (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && + VTSign->getVT() == MVT::i16)) && + Src.hasOneUse()) { auto *M = cast(Src); SDValue Ops[] = { Src.getOperand(0), // Chain @@ -16343,7 +16346,7 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { : &AMDGPU::SReg_32RegClass; if (!TRI->isSGPRClass(RC) && !isDivergent) return TRI->getEquivalentSGPRClass(RC); - else if (TRI->isSGPRClass(RC) && isDivergent) + if (TRI->isSGPRClass(RC) && isDivergent) return TRI->getEquivalentVGPRClass(RC); return RC; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6d12e8c6f2de2..7f7b7c4472042 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1381,13 +1381,13 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { // Assume hi bits are unneeded. Only _e64 true16 instructions are legal // before RA. return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64; - } else if (RI.getRegSizeInBits(*DstRC) == 32) { + } + if (RI.getRegSizeInBits(*DstRC) == 32) return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { + if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) return AMDGPU::S_MOV_B64; - } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { - return AMDGPU::V_MOV_B64_PSEUDO; - } + if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) + return AMDGPU::V_MOV_B64_PSEUDO; return AMDGPU::COPY; } @@ -4546,13 +4546,11 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, // SGPRs use the constant bus if (MO.isImplicit()) { - return MO.getReg() == AMDGPU::M0 || - MO.getReg() == AMDGPU::VCC || + return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::VCC_LO; - } else { - return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || - AMDGPU::SReg_64RegClass.contains(MO.getReg()); } + return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || + AMDGPU::SReg_64RegClass.contains(MO.getReg()); } static Register findImplicitSGPRRead(const MachineInstr &MI) { @@ -4859,8 +4857,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Dst register should be tied to implicit use of preserved register"; return false; - } else if (TiedMO.getReg().isPhysical() && - Dst.getReg() != TiedMO.getReg()) { + } + if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) { ErrInfo = "Dst register should use same physical register as preserved"; return false; } @@ -5232,7 +5230,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, "row_newbroadcast/row_share is not supported before " "GFX90A/GFX10"; return false; - } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { + } + if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { ErrInfo = "Invalid dpp_ctrl value: " "row_share and row_xmask are not supported before GFX10"; return false; @@ -9513,7 +9512,8 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); return nullptr; - } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { + } + if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 2186c1ede468c..c5251826b117c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -778,7 +778,8 @@ bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const { if (RC && SIRegisterInfo::isAGPRClass(RC)) { UsesAGPRs = true; return true; - } else if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) { + } + if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) { // Defer caching UsesAGPRs, function might not yet been regbank selected. 
return true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index fb4f5ea4aa760..7c7e0204b1764 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -617,9 +617,8 @@ SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { Res.TopDownBlock2Index = TopDownBlock2Index; Blocks[BlockVariant] = Res; return Res; - } else { - return B->second; } + return B->second; } bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { @@ -705,45 +704,42 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { HasSubGraph); if (!HasSubGraph) continue; // No dependencies between each other - else if (SubGraph.size() > 5) { + if (SubGraph.size() > 5) { // Too many elements would be required to be added to the block. CompatibleGroup = false; break; } - else { - // Check the type of dependency - for (unsigned k : SubGraph) { - // If in the path to join the two instructions, - // there is another high latency instruction, - // or instructions colored for another block - // abort the merge. - if (DAG->IsHighLatencySU[k] || - (CurrentColoring[k] != ProposedColor && - CurrentColoring[k] != 0)) { - CompatibleGroup = false; - break; - } - // If one of the SU in the subgraph depends on the result of SU j, - // there'll be a data dependency. - if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { - CompatibleGroup = false; - break; - } - } - if (!CompatibleGroup) + // Check the type of dependency + for (unsigned k : SubGraph) { + // If in the path to join the two instructions, + // there is another high latency instruction, + // or instructions colored for another block + // abort the merge. + if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor && + CurrentColoring[k] != 0)) { + CompatibleGroup = false; break; - // Same check for the SU - if (hasDataDependencyPred(SU, DAG->SUnits[j])) { + } + // If one of the SU in the subgraph depends on the result of SU j, + // there'll be a data dependency. + if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { CompatibleGroup = false; break; } - // Add all the required instructions to the block - // These cannot live in another block (because they - // depend (order dependency) on one of the - // instruction in the block, and are required for the - // high latency instruction we add. - llvm::append_range(AdditionalElements, SubGraph); } + if (!CompatibleGroup) + break; + // Same check for the SU + if (hasDataDependencyPred(SU, DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + // Add all the required instructions to the block + // These cannot live in another block (because they + // depend (order dependency) on one of the + // instruction in the block, and are required for the + // high latency instruction we add. 
+ llvm::append_range(AdditionalElements, SubGraph); } if (CompatibleGroup) { FormingGroup.insert(SU.NodeNum); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 1f6f45e9630ce..93b70fa4ba974 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -503,12 +503,12 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { SaveExecInst = &*J; LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); continue; - } else { - LLVM_DEBUG(dbgs() - << "Instruction does not read exec copy: " << *J << '\n'); - break; } - } else if (ReadsCopyFromExec && !SaveExecInst) { + LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J + << '\n'); + break; + } + if (ReadsCopyFromExec && !SaveExecInst) { // Make sure no other instruction is trying to use this copy, before it // will be rewritten by the saveexec, i.e. hasOneUse. There may have // been another use, such as an inserted spill. For example: diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index d428864c9dd59..d80e1277b2a8a 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -597,12 +597,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { Opcode == AMDGPU::V_LSHLREV_B32_e64) { return std::make_unique( Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); - } else { - return std::make_unique( - Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, - Opcode != AMDGPU::V_LSHRREV_B32_e32 && - Opcode != AMDGPU::V_LSHRREV_B32_e64); } + return std::make_unique( + Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, + Opcode != AMDGPU::V_LSHRREV_B32_e32 && + Opcode != AMDGPU::V_LSHRREV_B32_e64); break; } @@ -633,14 +632,12 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { break; if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || - Opcode == AMDGPU::V_LSHLREV_B16_e64) { + Opcode == AMDGPU::V_LSHLREV_B16_e64) return std::make_unique(Dst, Src1, BYTE_1, UNUSED_PAD); - } else { - return std::make_unique( - Src1, Dst, BYTE_1, false, false, - Opcode != AMDGPU::V_LSHRREV_B16_e32 && + return std::make_unique( + Src1, Dst, BYTE_1, false, false, + Opcode != AMDGPU::V_LSHRREV_B16_e32 && Opcode != AMDGPU::V_LSHRREV_B16_e64); - } break; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index bb5f2328129f9..96d4863e94014 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -537,8 +537,7 @@ CanBeVOPD getCanBeVOPD(unsigned Opc) { const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); if (Info) return {Info->CanBeVOPDX, true}; - else - return {false, false}; + return {false, false}; } unsigned getVOPDOpcode(unsigned Opc) { @@ -1479,11 +1478,10 @@ static unsigned getCombinedCountBitMask(const IsaVersion &Version, unsigned Storecnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), getStorecntBitWidth(Version.Major)); return Dscnt | Storecnt; - } else { - unsigned Loadcnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), - getLoadcntBitWidth(Version.Major)); - return Dscnt | Loadcnt; } + unsigned Loadcnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), + getLoadcntBitWidth(Version.Major)); + return Dscnt | Loadcnt; } Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt) { From 2bb65660ae8b9b2e1896b07b881505a4ffc0393b Mon Sep 17 00:00:00 2001 From: 
Florian Hahn Date: Wed, 17 Jul 2024 21:37:28 +0100 Subject: [PATCH 333/777] [LV] Allow re-processing of operands of instrs feeding interleave group Follow up to d216615518 to update dead interleave group pointer detection to allow re-processing of operands of instructions determined to only feed interleave groups. This is needed because instructions feeding interleave group pointers can become dead in any order, as per the newly added test case. --- .../Transforms/Vectorize/LoopVectorize.cpp | 9 +- .../LoopVectorize/X86/interleave-cost.ll | 228 ++++++++++++++++++ 2 files changed, 231 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c276a2995f54c..40919c944d21f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7027,7 +7027,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); - SmallVector InitialInterleavePointersOps; + SmallVector DeadInterleavePointerOps; for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { // Find all stores to invariant variables. Since they are going to sink @@ -7045,13 +7045,10 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { if (Group->getInsertPos() == &I) continue; Value *PointerOp = getLoadStorePointerOperand(&I); - InitialInterleavePointersOps.push_back(PointerOp); + DeadInterleavePointerOps.push_back(PointerOp); } } - SmallSetVector DeadInterleavePointerOps( - InitialInterleavePointersOps.rbegin(), - InitialInterleavePointersOps.rend()); // Mark ops feeding interleave group members as free, if they are only used // by other dead computations. 
for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { @@ -7064,7 +7061,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { })) continue; VecValuesToIgnore.insert(Op); - DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end()); + DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); } // Ignore type-promoting instructions we identified during reduction diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 9bba1a90096e6..b1f7516f3c8dc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -373,7 +373,230 @@ exit: ret void } +define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N) #1 { +; CHECK-LABEL: define void @geps_feeding_interleave_groups_with_reuse2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 28 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[N]], 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 24 +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[TMP4]], [[SCEVGEP]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 28 +; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 0, [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult ptr [[TMP8]], [[SCEVGEP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW4]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 20 +; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 0, [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult ptr [[TMP12]], [[SCEVGEP5]] +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[MUL_OVERFLOW8]] +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 16 +; CHECK-NEXT: [[MUL10:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT11:%.*]] = extractvalue { i64, i1 } [[MUL10]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW12:%.*]] = extractvalue { i64, i1 } [[MUL10]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[MUL_RESULT11]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr 
[[SCEVGEP9]], i64 [[MUL_RESULT11]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult ptr [[TMP16]], [[SCEVGEP9]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW12]] +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[A]], i64 12 +; CHECK-NEXT: [[MUL14:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT15:%.*]] = extractvalue { i64, i1 } [[MUL14]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW16:%.*]] = extractvalue { i64, i1 } [[MUL14]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = sub i64 0, [[MUL_RESULT15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SCEVGEP13]], i64 [[MUL_RESULT15]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult ptr [[TMP20]], [[SCEVGEP13]] +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP21]], [[MUL_OVERFLOW16]] +; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[MUL18:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT19:%.*]] = extractvalue { i64, i1 } [[MUL18]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW20:%.*]] = extractvalue { i64, i1 } [[MUL18]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 0, [[MUL_RESULT19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[SCEVGEP17]], i64 [[MUL_RESULT19]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ult ptr [[TMP24]], [[SCEVGEP17]] +; CHECK-NEXT: [[TMP26:%.*]] = or i1 [[TMP25]], [[MUL_OVERFLOW20]] +; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr i8, ptr [[A]], i64 4 +; CHECK-NEXT: [[MUL22:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT23:%.*]] = extractvalue { i64, i1 } [[MUL22]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW24:%.*]] = extractvalue { i64, i1 } [[MUL22]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = sub i64 0, [[MUL_RESULT23]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SCEVGEP21]], i64 [[MUL_RESULT23]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp ult ptr [[TMP28]], [[SCEVGEP21]] +; CHECK-NEXT: [[TMP30:%.*]] = or i1 [[TMP29]], [[MUL_OVERFLOW24]] +; CHECK-NEXT: [[MUL25:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT26:%.*]] = extractvalue { i64, i1 } [[MUL25]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW27:%.*]] = extractvalue { i64, i1 } [[MUL25]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = sub i64 0, [[MUL_RESULT26]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[MUL_RESULT26]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp ult ptr [[TMP32]], [[A]] +; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[MUL_OVERFLOW27]] +; CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP6]], [[TMP10]] +; CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP14]] +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP36]], [[TMP18]] +; CHECK-NEXT: [[TMP38:%.*]] = or i1 [[TMP37]], [[TMP22]] +; CHECK-NEXT: [[TMP39:%.*]] = or i1 [[TMP38]], [[TMP26]] +; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP39]], [[TMP30]] +; CHECK-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP34]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP42:%.*]] = lshr i64 [[N]], 3 +; CHECK-NEXT: [[TMP43:%.*]] = shl i64 [[TMP42]], 5 +; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 32 +; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = add nuw nsw i64 [[TMP43]], 4 +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = shl i64 [[TMP42]], 4 +; CHECK-NEXT: [[TMP47:%.*]] = add nuw nsw i64 [[TMP46]], 8 +; CHECK-NEXT: [[SCEVGEP30:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[TMP47]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP29]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP28]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND031:%.*]] = icmp ult ptr [[A]], [[SCEVGEP30]] +; CHECK-NEXT: [[BOUND132:%.*]] = icmp ult ptr [[B]], [[SCEVGEP28]] +; CHECK-NEXT: [[FOUND_CONFLICT33:%.*]] = and i1 [[BOUND031]], [[BOUND132]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT33]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP49:%.*]] = select i1 [[TMP48]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[TMP49]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP51:%.*]] = lshr exact i64 [[TMP50]], 1 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP51]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[TMP52]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP53]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> , <4 x i32> poison), !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP55:%.*]] = or disjoint i64 [[TMP50]], 7 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 -7 +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <8 x i32> [[TMP58]], <8 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <8 x i32> [[TMP60]], <8 x i32> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <32 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[TMP63]], <32 x i32> poison, <32 x i32> +; CHECK-NEXT: store <32 x i32> [[INTERLEAVED_VEC]], ptr [[TMP57]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP64]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SHR_1:%.*]] = lshr exact i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr nusw i32, ptr [[B]], i64 [[SHR_1]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[L]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[IV_NEXT:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr exact i64 [[IV_NEXT_1]], 1 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr i32, ptr [[B]], i64 [[SHR_2]] +; CHECK-NEXT: [[TMP65:%.*]] = load i32, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: store i32 [[TMP65]], ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_A_3:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_3]], align 4 +; CHECK-NEXT: [[IV_NEXT_3:%.*]] = or disjoint i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_B_4:%.*]] = getelementptr i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[GEP_B_4]], align 4 +; CHECK-NEXT: [[GEP_A_4:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_3]] +; CHECK-NEXT: store i32 [[TMP66]], ptr [[GEP_A_4]], align 4 +; CHECK-NEXT: [[IV_NEXT_4:%.*]] = or disjoint i64 [[IV]], 5 +; CHECK-NEXT: [[GEP_A_5:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_4]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_5]], align 4 +; CHECK-NEXT: [[IV_NEXT_5:%.*]] = or disjoint i64 [[IV]], 6 +; CHECK-NEXT: [[GEP_A_6:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_5]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_6]], align 4 +; CHECK-NEXT: [[IV_NEXT_6:%.*]] = or disjoint i64 [[IV]], 7 +; CHECK-NEXT: [[GEP_A_7:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_6]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_7]], align 4 +; CHECK-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next.7, %loop ] + %shr.1 = lshr exact i64 %iv, 1 + %gep.B = getelementptr nusw i32, ptr %B, i64 %shr.1 + %l = load i32, ptr %gep.B, align 4 + %gep.A = getelementptr i32, ptr %A, i64 %iv + store i32 %l, ptr %gep.A, align 4 + %iv.next = or disjoint i64 %iv, 1 + %gep.A.1 = getelementptr i32, ptr %A, i64 %iv.next + store i32 0, ptr %gep.A.1, align 4 + %iv.next.1 = or disjoint i64 %iv, 2 + %shr.2 = lshr exact i64 %iv.next.1, 1 + %gep.B.2 = getelementptr i32, ptr %B, i64 %shr.2 + %1 = load i32, ptr %gep.B.2, align 4 + %gep.A.2 = getelementptr i32, ptr %A, i64 %iv.next.1 + store i32 %1, ptr %gep.A.2, align 4 + %iv.next.2 = or disjoint i64 %iv, 3 + %gep.A.3 = getelementptr i32, ptr %A, i64 %iv.next.2 + store i32 0, ptr %gep.A.3, align 4 + %iv.next.3 = or disjoint i64 %iv, 4 + %gep.B.4 = getelementptr i32, ptr %B, i64 %iv + %2 = load i32, ptr %gep.B.4, align 4 + 
%gep.A.4 = getelementptr i32, ptr %A, i64 %iv.next.3 + store i32 %2, ptr %gep.A.4, align 4 + %iv.next.4 = or disjoint i64 %iv, 5 + %gep.A.5 = getelementptr i32, ptr %A, i64 %iv.next.4 + store i32 0, ptr %gep.A.5, align 4 + %iv.next.5 = or disjoint i64 %iv, 6 + %gep.A.6 = getelementptr i32, ptr %A, i64 %iv.next.5 + store i32 0, ptr %gep.A.6, align 4 + %iv.next.6 = or disjoint i64 %iv, 7 + %gep.A.7 = getelementptr i32, ptr %A, i64 %iv.next.6 + store i32 0, ptr %gep.A.7, align 4 + %iv.next.7 = add nuw nsw i64 %iv, 8 + %ec = icmp eq i64 %iv, %N + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + attributes #0 = { "target-features"="+sse4.2" } +attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -382,4 +605,9 @@ attributes #0 = { "target-features"="+sse4.2" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} ;. From a742693f6104055ec026852a70a68275fb82f7a0 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 17 Jul 2024 13:40:16 -0700 Subject: [PATCH 334/777] [ctx_prof] Add missing test for `PGOContextualProfile::getContainedGuids` --- .../unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp index d2cdbb28e2fce..6c6798ded00b5 100644 --- a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp +++ b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp @@ -115,6 +115,15 @@ TEST_F(PGOCtxProfRWTest, RoundTrip) { EXPECT_EQ(Ctxes.size(), 2U); for (auto &[G, R] : roots()) checkSame(*R, Ctxes.find(G)->second); + + DenseSet Guids; + Ctxes.at(1U).getContainedGuids(Guids); + EXPECT_THAT(Guids, + testing::WhenSorted(testing::ElementsAre(1U, 2U, 4U, 5U))); + + Guids.clear(); + Ctxes.at(3U).getContainedGuids(Guids); + EXPECT_THAT(Guids, testing::ElementsAre(3U)); } } From 33cb29cc3e38990173688aee353d6cbeeb187728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 17 Jul 2024 13:52:36 -0700 Subject: [PATCH 335/777] [flang][cuda] Use cuf.alloc/cuf.free for local descriptor (#98518) Local descriptor for cuda allocatable need to be handled on host and device. One solution is to duplicate the descriptor (one on the host and one on the device) and keep them in sync or have the descriptor in managed/unified memory so we don't to take care of any sync. The second solution is probably the one we will implement. In order to have more flexibility on how descriptor representing cuda allocatable are allocated, this patch updates the lowering to use the cuf operations alloc and free to managed them. 
--- flang/include/flang/Semantics/tools.h | 8 ++---- flang/lib/Lower/ConvertVariable.cpp | 32 ++++++++++++---------- flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 6 ++-- flang/test/Fir/cuf-invalid.fir | 11 +------- flang/test/Lower/CUDA/cuda-allocatable.cuf | 21 +++++++++----- 5 files changed, 38 insertions(+), 40 deletions(-) diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 0b5308d9242de..0fcba3131fad1 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -222,7 +222,6 @@ inline bool HasCUDAAttr(const Symbol &sym) { } inline bool NeedCUDAAlloc(const Symbol &sym) { - bool inDeviceSubprogram{IsCUDADeviceContext(&sym.owner())}; if (IsDummy(sym)) { return false; } @@ -230,11 +229,8 @@ inline bool NeedCUDAAlloc(const Symbol &sym) { if (details->cudaDataAttr() && (*details->cudaDataAttr() == common::CUDADataAttr::Device || *details->cudaDataAttr() == common::CUDADataAttr::Managed || - *details->cudaDataAttr() == common::CUDADataAttr::Unified)) { - // Descriptor is allocated on host when in host context. - if (IsAllocatable(sym)) { - return inDeviceSubprogram; - } + *details->cudaDataAttr() == common::CUDADataAttr::Unified || + *details->cudaDataAttr() == common::CUDADataAttr::Pinned)) { return true; } } diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index e5a71c5ec5b4a..47ad48fb322cc 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -715,8 +715,9 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, auto idxTy = builder.getIndexType(); for (mlir::Value sh : elidedShape) indices.push_back(builder.createConvert(loc, idxTy, sh)); - return builder.create(loc, ty, nm, symNm, dataAttr, lenParams, - indices); + mlir::Value alloc = builder.create( + loc, ty, nm, symNm, dataAttr, lenParams, indices); + return alloc; } // Let the builder do all the heavy lifting. 
@@ -927,6 +928,19 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter, finalizeAtRuntime(converter, var, symMap); if (mustBeDefaultInitializedAtRuntime(var)) defaultInitializeAtRuntime(converter, var, symMap); + if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) { + auto *builder = &converter.getFirOpBuilder(); + mlir::Location loc = converter.getCurrentLocation(); + fir::ExtendedValue exv = + converter.getSymbolExtendedValue(var.getSymbol(), &symMap); + auto *sym = &var.getSymbol(); + converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() { + cuf::DataAttributeAttr dataAttr = + Fortran::lower::translateSymbolCUFDataAttribute(builder->getContext(), + *sym); + builder->create(loc, fir::getBase(exv), dataAttr); + }); + } if (std::optional cleanup = needDeallocationOrFinalization(var)) { auto *builder = &converter.getFirOpBuilder(); @@ -950,22 +964,10 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter, "trying to deallocate entity not lowered as allocatable"); Fortran::lower::genDeallocateIfAllocated(*converterPtr, *mutableBox, loc, sym); + }); } } - if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) { - auto *builder = &converter.getFirOpBuilder(); - mlir::Location loc = converter.getCurrentLocation(); - fir::ExtendedValue exv = - converter.getSymbolExtendedValue(var.getSymbol(), &symMap); - auto *sym = &var.getSymbol(); - converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() { - cuf::DataAttributeAttr dataAttr = - Fortran::lower::translateSymbolCUFDataAttribute(builder->getContext(), - *sym); - builder->create(loc, fir::getBase(exv), dataAttr); - }); - } } //===----------------------------------------------------------------===// diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 53092bed5720b..f7b36b208a7de 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -54,9 +54,11 @@ template static llvm::LogicalResult checkCudaAttr(Op op) { if (op.getDataAttr() == cuf::DataAttribute::Device || op.getDataAttr() == cuf::DataAttribute::Managed || - op.getDataAttr() == cuf::DataAttribute::Unified) + op.getDataAttr() == cuf::DataAttribute::Unified || + op.getDataAttr() == cuf::DataAttribute::Pinned) return mlir::success(); - return op.emitOpError("expect device, managed or unified cuda attribute"); + return op.emitOpError() + << "expect device, managed, pinned or unified cuda attribute"; } llvm::LogicalResult cuf::AllocOp::verify() { return checkCudaAttr(*this); } diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index 6e18e48ac82fc..06e08d14b2435 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -88,18 +88,9 @@ func.func @_QPsub1() { // ----- -func.func @_QPsub1() { - // expected-error@+1{{'cuf.alloc' op expect device, managed or unified cuda attribute}} - %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda, uniq_name = "_QFsub1Er"} -> !fir.ref - cuf.free %0 : !fir.ref {data_attr = #cuf.cuda} - return -} - -// ----- - func.func @_QPsub1() { %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda, uniq_name = "_QFsub1Er"} -> !fir.ref - // expected-error@+1{{'cuf.free' op expect device, managed or unified cuda attribute}} + // expected-error@+1{{'cuf.free' op expect device, managed, pinned or unified cuda attribute}} cuf.free %0 : !fir.ref {data_attr = #cuf.cuda} return } diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf 
b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 74a3ec100a8f2..82c1063507535 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -10,7 +10,7 @@ subroutine sub1() end subroutine ! CHECK-LABEL: func.func @_QPsub1() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub1Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: fir.call @_FortranAAllocatableSetBounds ! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 @@ -25,6 +25,7 @@ end subroutine ! CHECK: fir.if %[[NE_C0]] { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub2() real, allocatable, managed :: a(:) @@ -35,7 +36,7 @@ subroutine sub2() end subroutine ! CHECK-LABEL: func.func @_QPsub2() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub2Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub2Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"} ! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -49,6 +50,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub3() integer, allocatable, pinned :: a(:,:) @@ -57,7 +59,7 @@ subroutine sub3() end subroutine ! CHECK-LABEL: func.func @_QPsub3() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub3Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub3Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub3Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"} ! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) @@ -66,6 +68,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub4() real, allocatable, device :: a(:) @@ -74,7 +77,7 @@ subroutine sub4() end subroutine ! CHECK-LABEL: func.func @_QPsub4() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub4Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub4Ea"} -> !fir.ref>>> ! 
CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"} ! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -84,6 +87,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub5() real, allocatable, device :: a(:) @@ -92,7 +96,7 @@ subroutine sub5() end subroutine ! CHECK-LABEL: func.func @_QPsub5() -! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub5Ea"} +! CHECK: %[[BOX_A:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub5Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub5Eb"} ! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) @@ -104,6 +108,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub6() real, allocatable, device :: a(:) @@ -112,7 +117,7 @@ subroutine sub6() end subroutine ! CHECK-LABEL: func.func @_QPsub6() -! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub6Ea"} +! CHECK: %[[BOX_A:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub6Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub6Eb"} ! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) @@ -122,6 +127,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub7() real, allocatable, device :: a(:) @@ -133,7 +139,7 @@ subroutine sub7() end subroutine ! CHECK-LABEL: func.func @_QPsub7() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub7Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub7Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub7Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"} ! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) @@ -150,3 +156,4 @@ end subroutine ! 
CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} From fffe2728534a238ff0024e11a18280f85094dcde Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 17 Jul 2024 13:53:10 -0700 Subject: [PATCH 336/777] [ADT] Make set_subtract more efficient when subtrahend is larger (NFC) (#98702) If the subtrahend is larger, iterate the minuend set instead. Noticed when subtracting a large set from a number of other smaller sets for an upcoming MemProf change, this change makes that much faster. I subsequently found a couple of callsites in one file that were calling set_subtract with a vector subtrahend, which doesn't have the "count()" interface. Add a separate helper for subtracting a vector. --- llvm/include/llvm/ADT/SetOperations.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/include/llvm/ADT/SetOperations.h b/llvm/include/llvm/ADT/SetOperations.h index 1a911b239f4c6..ba784bddfe79a 100644 --- a/llvm/include/llvm/ADT/SetOperations.h +++ b/llvm/include/llvm/ADT/SetOperations.h @@ -94,7 +94,22 @@ S1Ty set_difference(const S1Ty &S1, const S2Ty &S2) { /// set_subtract(A, B) - Compute A := A - B /// +/// Selects the set to iterate based on the relative sizes of A and B for better +/// efficiency. +/// template void set_subtract(S1Ty &S1, const S2Ty &S2) { + using ElemTy = decltype(*S1.begin()); + // A couple callers pass a vector for S2, which doesn't support contains(), + // and wouldn't be efficient if it did. + if constexpr (detail::HasMemberContains) { + if (S1.size() < S2.size()) { + for (typename S1Ty::iterator SI = S1.begin(), SE = S1.end(); SI != SE; + ++SI) + if (S2.contains(*SI)) + S1.erase(SI); + return; + } + } for (typename S2Ty::const_iterator SI = S2.begin(), SE = S2.end(); SI != SE; ++SI) S1.erase(*SI); From 306196349f7e7a92156ca733f876d503049696e7 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 17 Jul 2024 14:01:29 -0700 Subject: [PATCH 337/777] Revert "[ADT] Make set_subtract more efficient when subtrahend is larger (NFC)" (#99386) Reverts llvm/llvm-project#98702 This broke some mlir code and needs investigation. --- llvm/include/llvm/ADT/SetOperations.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/llvm/include/llvm/ADT/SetOperations.h b/llvm/include/llvm/ADT/SetOperations.h index ba784bddfe79a..1a911b239f4c6 100644 --- a/llvm/include/llvm/ADT/SetOperations.h +++ b/llvm/include/llvm/ADT/SetOperations.h @@ -94,22 +94,7 @@ S1Ty set_difference(const S1Ty &S1, const S2Ty &S2) { /// set_subtract(A, B) - Compute A := A - B /// -/// Selects the set to iterate based on the relative sizes of A and B for better -/// efficiency. -/// template void set_subtract(S1Ty &S1, const S2Ty &S2) { - using ElemTy = decltype(*S1.begin()); - // A couple callers pass a vector for S2, which doesn't support contains(), - // and wouldn't be efficient if it did. 
- if constexpr (detail::HasMemberContains) { - if (S1.size() < S2.size()) { - for (typename S1Ty::iterator SI = S1.begin(), SE = S1.end(); SI != SE; - ++SI) - if (S2.contains(*SI)) - S1.erase(SI); - return; - } - } for (typename S2Ty::const_iterator SI = S2.begin(), SE = S2.end(); SI != SE; ++SI) S1.erase(*SI); From 1ecffdaf27cb456aecc5a1c0272d3994d26bf645 Mon Sep 17 00:00:00 2001 From: jameshu15869 <55058507+jameshu15869@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:07:12 -0500 Subject: [PATCH 338/777] [libc] Add Kernel Resource Usage to nvptx-loader (#97503) This PR allows `nvptx-loader` to read the resource usage of `_start`, `_begin`, and `_end` when executing CUDA binaries. Example output: ``` $ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__ [ RUN ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper [ OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev _begin registers: 25 _start registers: 80 _end registers: 62 ``` --------- Co-authored-by: Joseph Huber --- libc/benchmarks/gpu/CMakeLists.txt | 4 ++- libc/cmake/modules/LLVMLibCTestRules.cmake | 14 +++++++-- libc/utils/gpu/loader/Loader.h | 2 +- libc/utils/gpu/loader/Main.cpp | 10 ++++-- libc/utils/gpu/loader/amdgpu/Loader.cpp | 36 ++++++++++++++-------- libc/utils/gpu/loader/nvptx/Loader.cpp | 34 +++++++++++++++----- 6 files changed, 73 insertions(+), 27 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index eaeecbdacd23e..14ba9f3f64b48 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -15,13 +15,15 @@ function(add_benchmark benchmark_name) endif() add_libc_hermetic( ${benchmark_name} - IS_BENCHMARK + IS_GPU_BENCHMARK LINK_LIBRARIES LibcGpuBenchmark.hermetic ${BENCHMARK_LINK_LIBRARIES} ${BENCHMARK_UNPARSED_ARGUMENTS} ) get_fq_target_name(${benchmark_name} fq_target_name) + set(fq_build_target_name ${fq_target_name}.__build__) + add_dependencies(gpu-benchmark ${fq_target_name}) endfunction(add_benchmark) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index fbeec32883b63..4d349cb1799da 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -553,7 +553,7 @@ function(add_libc_hermetic test_name) endif() cmake_parse_arguments( "HERMETIC_TEST" - "IS_BENCHMARK" # Optional arguments + "IS_GPU_BENCHMARK" # Optional arguments "SUITE" # Single value arguments "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments ${ARGN} @@ -709,14 +709,24 @@ function(add_libc_hermetic test_name) $ ${HERMETIC_TEST_ARGS}) add_custom_target( ${fq_target_name} + DEPENDS ${fq_target_name}-cmd + ) + + add_custom_command( + OUTPUT ${fq_target_name}-cmd COMMAND ${test_cmd} COMMAND_EXPAND_LISTS COMMENT "Running hermetic test ${fq_target_name}" ${LIBC_HERMETIC_TEST_JOB_POOL} ) + set_source_files_properties(${fq_target_name}-cmd + PROPERTIES + SYMBOLIC "TRUE" + ) + add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name}) - if(NOT ${HERMETIC_TEST_IS_BENCHMARK}) + if(NOT ${HERMETIC_TEST_IS_GPU_BENCHMARK}) # If it is a benchmark, it will already have been added to the # gpu-benchmark target add_dependencies(libc-hermetic-tests ${fq_target_name}) diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index eae2776b2773f..e029816764427 100644 --- a/libc/utils/gpu/loader/Loader.h +++ 
b/libc/utils/gpu/loader/Loader.h @@ -54,7 +54,7 @@ struct end_args_t { /// kernel on the target device. Copies \p argc and \p argv to the device. /// Returns the final value of the `main` function on the device. int load(int argc, char **argv, char **evnp, void *image, size_t size, - const LaunchParameters ¶ms); + const LaunchParameters ¶ms, bool print_resource_usage); /// Return \p V aligned "upwards" according to \p Align. template inline V align_up(V val, A align) { diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp index b711ec91c9f30..a9c0b868725d0 100644 --- a/libc/utils/gpu/loader/Main.cpp +++ b/libc/utils/gpu/loader/Main.cpp @@ -20,7 +20,8 @@ int main(int argc, char **argv, char **envp) { if (argc < 2) { - printf("USAGE: ./loader [--threads , --blocks ] " + printf("USAGE: ./loader [--threads , --blocks , " + "--print-resource-usage] " ", ...\n"); return EXIT_SUCCESS; } @@ -29,6 +30,7 @@ int main(int argc, char **argv, char **envp) { FILE *file = nullptr; char *ptr; LaunchParameters params = {1, 1, 1, 1, 1, 1}; + bool print_resource_usage = false; while (!file && ++offset < argc) { if (argv[offset] == std::string("--threads") || argv[offset] == std::string("--threads-x")) { @@ -62,6 +64,9 @@ int main(int argc, char **argv, char **envp) { offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; offset++; continue; + } else if (argv[offset] == std::string("--print-resource-usage")) { + print_resource_usage = true; + continue; } else { file = fopen(argv[offset], "r"); if (!file) { @@ -87,7 +92,8 @@ int main(int argc, char **argv, char **envp) { fclose(file); // Drop the loader from the program arguments. - int ret = load(argc - offset, &argv[offset], envp, image, size, params); + int ret = load(argc - offset, &argv[offset], envp, image, size, params, + print_resource_usage); free(image); return ret; diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index f8d178be7a517..a9ce36194d94d 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -125,6 +125,10 @@ hsa_status_t get_agent(hsa_agent_t *output_agent) { return iterate_agents(cb); } +void print_kernel_resources(char *kernel_name) { + fprintf("Kernel resources on AMDGPU is not supported yet.\n"); +} + /// Retrieve a global memory pool with a \p flag from the agent. template hsa_status_t get_agent_memory_pool(hsa_agent_t agent, @@ -156,8 +160,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, hsa_amd_memory_pool_t coarsegrained_pool, hsa_queue_t *queue, rpc_device_t device, const LaunchParameters ¶ms, - const char *kernel_name, args_t kernel_args) { - // Look up the '_start' kernel in the loaded executable. + const char *kernel_name, args_t kernel_args, + bool print_resource_usage) { + // Look up the kernel in the loaded executable. hsa_executable_symbol_t symbol; if (hsa_status_t err = hsa_executable_get_symbol_by_name( executable, kernel_name, &dev_agent, &symbol)) @@ -220,7 +225,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, handle_error(err); hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args); - // Initialie all the arguments (explicit and implicit) to zero, then set the + // Initialize all the arguments (explicit and implicit) to zero, then set the // explicit arguments to the values created above. 
std::memset(args, 0, args_size); std::memcpy(args, &kernel_args, sizeof(args_t)); @@ -270,6 +275,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) handle_error(err); + if (print_resource_usage) + print_kernel_resources(kernel_name); + // Initialize the packet header and set the doorbell signal to begin execution // by the HSA runtime. uint16_t header = @@ -327,7 +335,7 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, } int load(int argc, char **argv, char **envp, void *image, size_t size, - const LaunchParameters ¶ms) { + const LaunchParameters ¶ms, bool print_resource_usage) { // Initialize the HSA runtime used to communicate with the device. if (hsa_status_t err = hsa_init()) handle_error(err); @@ -545,15 +553,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - device, single_threaded_params, "_begin.kd", init_args)) + if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, + coarsegrained_pool, queue, device, + single_threaded_params, "_begin.kd", + init_args, print_resource_usage)) handle_error(err); start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; - if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, - coarsegrained_pool, queue, device, - params, "_start.kd", args)) + if (hsa_status_t err = launch_kernel( + dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, + device, params, "_start.kd", args, print_resource_usage)) handle_error(err); void *host_ret; @@ -571,9 +580,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, int ret = *static_cast(host_ret); end_args_t fini_args = {ret}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - device, single_threaded_params, "_end.kd", fini_args)) + if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, + coarsegrained_pool, queue, device, + single_threaded_params, "_end.kd", + fini_args, print_resource_usage)) handle_error(err); if (rpc_status_t err = rpc_server_shutdown( diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp index 012cb778ecf15..9c3cf3ae19b41 100644 --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -152,10 +152,23 @@ Expected get_ctor_dtor_array(const void *image, const size_t size, return dev_memory; } +void print_kernel_resources(CUmodule binary, const char *kernel_name) { + CUfunction function; + if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) + handle_error(err); + int num_regs; + if (CUresult err = + cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) + handle_error(err); + printf("Executing kernel %s:\n", kernel_name); + printf("%6s registers: %d\n", kernel_name, num_regs); +} + template CUresult launch_kernel(CUmodule binary, CUstream stream, rpc_device_t rpc_device, const LaunchParameters ¶ms, - const char *kernel_name, args_t kernel_args) { + const char *kernel_name, args_t kernel_args, + bool print_resource_usage) { // look up the '_start' kernel in the loaded module. 
CUfunction function; if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) @@ -208,6 +221,9 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, }, &memory_stream); + if (print_resource_usage) + print_kernel_resources(binary, kernel_name); + // Call the kernel with the given arguments. if (CUresult err = cuLaunchKernel( function, params.num_blocks_x, params.num_blocks_y, @@ -230,7 +246,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, } int load(int argc, char **argv, char **envp, void *image, size_t size, - const LaunchParameters ¶ms) { + const LaunchParameters ¶ms, bool print_resource_usage) { if (CUresult err = cuInit(0)) handle_error(err); // Obtain the first device found on the system. @@ -323,14 +339,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (CUresult err = launch_kernel(binary, stream, rpc_device, - single_threaded_params, "_begin", init_args)) + if (CUresult err = + launch_kernel(binary, stream, rpc_device, single_threaded_params, + "_begin", init_args, print_resource_usage)) handle_error(err); start_args_t args = {argc, dev_argv, dev_envp, reinterpret_cast(dev_ret)}; - if (CUresult err = - launch_kernel(binary, stream, rpc_device, params, "_start", args)) + if (CUresult err = launch_kernel(binary, stream, rpc_device, params, "_start", + args, print_resource_usage)) handle_error(err); // Copy the return value back from the kernel and wait. @@ -342,8 +359,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); end_args_t fini_args = {host_ret}; - if (CUresult err = launch_kernel(binary, stream, rpc_device, - single_threaded_params, "_end", fini_args)) + if (CUresult err = + launch_kernel(binary, stream, rpc_device, single_threaded_params, + "_end", fini_args, print_resource_usage)) handle_error(err); // Free the memory allocated for the device. From 82b800ecb35fb46881aa52000fa40b1b99aa654e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 17 Jul 2024 14:01:29 -0700 Subject: [PATCH 339/777] [SLP][NFC]Limit number of the external uses analysis, NFC. BoUpSLP::buildExternalUses runs through all the users of the vectorized scalars, which may require significant amount of time, if there are too many users. Limited the analysis, if there are too many users, all of them are replaced, not individually. 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1cf2ff89371d9..d88c6307b994b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5892,6 +5892,8 @@ void BoUpSLP::buildExternalUses( } } + if (U && Scalar->hasNUsesOrMore(UsesLimit)) + U = nullptr; int FoundLane = Entry->findLaneForValue(Scalar); LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst << " from lane " << FoundLane << " from " << *Scalar @@ -13940,6 +13942,7 @@ Value *BoUpSLP::vectorizeTree( if (!ScalarsWithNullptrUser.insert(Scalar).second) continue; assert((ExternallyUsedValues.count(Scalar) || + Scalar->hasNUsesOrMore(UsesLimit) || any_of(Scalar->users(), [&](llvm::User *U) { if (ExternalUsesAsGEPs.contains(U)) From 10b4834b76e0473eee3eb70490dd39366589534d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 17 Jul 2024 16:34:47 -0500 Subject: [PATCH 340/777] [libc] Fix wrong printf usage in AMDGPU loader --- libc/utils/gpu/loader/amdgpu/Loader.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index a9ce36194d94d..8cf6ea5dc9aec 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -125,8 +125,8 @@ hsa_status_t get_agent(hsa_agent_t *output_agent) { return iterate_agents(cb); } -void print_kernel_resources(char *kernel_name) { - fprintf("Kernel resources on AMDGPU is not supported yet.\n"); +void print_kernel_resources(const char *kernel_name) { + fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n"); } /// Retrieve a global memory pool with a \p flag from the agent. From f6add66b720f85bf1092af7d6702b7397da57349 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 17 Jul 2024 14:36:11 -0700 Subject: [PATCH 341/777] [instcombine] Extend logical reduction canonicalization to scalable vectors (#99366) These transformations do not depend on the type being fixed in size, so enable them for scalable vectors too. Unlike for fixed vectors, these are only a canonicalization - the bitcast lowering for and/or/add is not legal on a scalable vector type. 
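
As a concrete illustration (a sketch distilled from the new nxv2i1 tests added below, not an exhaustive list of the affected intrinsics), the i1 `mul` reduction on a scalable mask is now canonicalized to an `and` reduction just like its fixed-width counterpart:

```llvm
declare i1 @llvm.vector.reduce.mul.nxv2i1(<vscale x 2 x i1>)
declare i1 @llvm.vector.reduce.and.nxv2i1(<vscale x 2 x i1>)

; Before instcombine: an i1 multiply reduction over a scalable mask.
define i1 @reduction_logical_mul_nxv2i1(<vscale x 2 x i1> %x) {
  %r = call i1 @llvm.vector.reduce.mul.nxv2i1(<vscale x 2 x i1> %x)
  ret i1 %r
}

; After instcombine: multiplication of i1 values is a logical and, so the
; call is rewritten to the and reduction; no bitcast lowering is introduced.
define i1 @reduction_logical_mul_nxv2i1_canonical(<vscale x 2 x i1> %x) {
  %r = call i1 @llvm.vector.reduce.and.nxv2i1(<vscale x 2 x i1> %x)
  ret i1 %r
}
```

The same canonicalization applies to the other boolean reductions exercised by the updated tests: xor becomes add, smin and umax become or, and smax and umin become and.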
--- .../Transforms/InstCombine/InstCombineCalls.cpp | 16 ++++++++-------- .../InstCombine/vector-logical-reductions.ll | 14 +++++++------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 467b291f9a4c3..809be499ee0f9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3430,8 +3430,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) { - if (auto *FTy = dyn_cast(Vect->getType())) - if (FTy->getElementType() == Builder.getInt1Ty()) { + if (auto *VTy = dyn_cast(Vect->getType())) + if (VTy->getElementType() == Builder.getInt1Ty()) { Value *Res = Builder.CreateAddReduce(Vect); if (Arg != Vect) Res = Builder.CreateCast(cast(Arg)->getOpcode(), Res, @@ -3460,8 +3460,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) { - if (auto *FTy = dyn_cast(Vect->getType())) - if (FTy->getElementType() == Builder.getInt1Ty()) { + if (auto *VTy = dyn_cast(Vect->getType())) + if (VTy->getElementType() == Builder.getInt1Ty()) { Value *Res = Builder.CreateAndReduce(Vect); if (Res->getType() != II->getType()) Res = Builder.CreateZExt(Res, II->getType()); @@ -3491,8 +3491,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) { - if (auto *FTy = dyn_cast(Vect->getType())) - if (FTy->getElementType() == Builder.getInt1Ty()) { + if (auto *VTy = dyn_cast(Vect->getType())) + if (VTy->getElementType() == Builder.getInt1Ty()) { Value *Res = IID == Intrinsic::vector_reduce_umin ? Builder.CreateAndReduce(Vect) : Builder.CreateOrReduce(Vect); @@ -3533,8 +3533,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) { - if (auto *FTy = dyn_cast(Vect->getType())) - if (FTy->getElementType() == Builder.getInt1Ty()) { + if (auto *VTy = dyn_cast(Vect->getType())) + if (VTy->getElementType() == Builder.getInt1Ty()) { Instruction::CastOps ExtOpc = Instruction::CastOps::CastOpsEnd; if (Arg != Vect) ExtOpc = cast(Arg)->getOpcode(); diff --git a/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll b/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll index 74f4ed01085f8..52e6a0b009978 100644 --- a/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll +++ b/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll @@ -51,7 +51,7 @@ define i1 @reduction_logical_mul(<2 x i1> %x) { define i1 @reduction_logical_mul_nxv2i1( %x) { ; CHECK-LABEL: @reduction_logical_mul_nxv2i1( -; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.mul.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.and.nxv2i1( [[X:%.*]]) ; CHECK-NEXT: ret i1 [[R]] ; %r = call i1 @llvm.vector.reduce.mul.nxv2i1( %x) @@ -71,7 +71,7 @@ define i1 @reduction_logical_xor(<2 x i1> %x) { define i1 @reduction_logical_xor_nxv2i1( %x) { ; CHECK-LABEL: @reduction_logical_xor_nxv2i1( -; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.xor.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.add.nxv2i1( [[X:%.*]]) ; CHECK-NEXT: ret i1 [[R]] ; %r = call i1 @llvm.vector.reduce.xor.nxv2i1( %x) @@ -90,7 +90,7 @@ define i1 @reduction_logical_smin(<2 x i1> %x) { define i1 @reduction_logical_smin_nxv2i1( %x) { ; CHECK-LABEL: @reduction_logical_smin_nxv2i1( -; 
CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.smin.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[X:%.*]]) ; CHECK-NEXT: ret i1 [[R]] ; %r = call i1 @llvm.vector.reduce.smin.nxv2i1( %x) @@ -109,7 +109,7 @@ define i1 @reduction_logical_smax(<2 x i1> %x) { define i1 @reduction_logical_smax_nxv2i1( %x) { ; CHECK-LABEL: @reduction_logical_smax_nxv2i1( -; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.smax.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.and.nxv2i1( [[X:%.*]]) ; CHECK-NEXT: ret i1 [[R]] ; %r = call i1 @llvm.vector.reduce.smax.nxv2i1( %x) @@ -128,7 +128,7 @@ define i1 @reduction_logical_umin(<2 x i1> %x) { define i1 @reduction_logical_umin_nxv2i1( %x) { ; CHECK-LABEL: @reduction_logical_umin_nxv2i1( -; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.umin.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.and.nxv2i1( [[X:%.*]]) ; CHECK-NEXT: ret i1 [[R]] ; %r = call i1 @llvm.vector.reduce.umin.nxv2i1( %x) @@ -147,7 +147,7 @@ define i1 @reduction_logical_umax(<2 x i1> %x) { define i1 @reduction_logical_umax_nxv2i1( %x) { ; CHECK-LABEL: @reduction_logical_umax_nxv2i1( -; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.umax.nxv2i1( [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[X:%.*]]) ; CHECK-NEXT: ret i1 [[R]] ; %r = call i1 @llvm.vector.reduce.umax.nxv2i1( %x) @@ -199,7 +199,7 @@ define i1 @reduction_logical_and_reverse_v2i1(<2 x i1> %p) { define i1 @reduction_logical_xor_reverse_nxv2i1( %p) { ; CHECK-LABEL: @reduction_logical_xor_reverse_nxv2i1( -; CHECK-NEXT: [[RED:%.*]] = call i1 @llvm.vector.reduce.xor.nxv2i1( [[P:%.*]]) +; CHECK-NEXT: [[RED:%.*]] = call i1 @llvm.vector.reduce.add.nxv2i1( [[P:%.*]]) ; CHECK-NEXT: ret i1 [[RED]] ; %rev = call @llvm.vector.reverse.nxv2i1( %p) From d08527ee3ee2dc1e90d2afcc6e5982d0997dad20 Mon Sep 17 00:00:00 2001 From: Joshua Baehring <98630690+JoshuaMBa@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:21:52 -0700 Subject: [PATCH 342/777] [scudo] Add static vector functionality. (#98986) The scudo vector implementation maintains static local data before switching to dynamically allocated data as the array size grows. Users of the vector must now specify the size of the static local data through the vector template (the default size has been removed). If 0 is specified for the size of the static local data, an assertion will be triggered. --- compiler-rt/lib/scudo/standalone/string_utils.h | 2 +- .../lib/scudo/standalone/tests/vector_test.cpp | 8 ++++---- compiler-rt/lib/scudo/standalone/vector.h | 15 +++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/string_utils.h b/compiler-rt/lib/scudo/standalone/string_utils.h index 6e00b63779737..cf61e150f20e5 100644 --- a/compiler-rt/lib/scudo/standalone/string_utils.h +++ b/compiler-rt/lib/scudo/standalone/string_utils.h @@ -40,7 +40,7 @@ class ScopedString { void appendString(int Width, int MaxChars, const char *S); void appendPointer(u64 ptr_value); - Vector String; + Vector String; }; void Printf(const char *Format, ...) 
FORMAT(1, 2); diff --git a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp index 1547824c11763..a972d24a62688 100644 --- a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp @@ -11,7 +11,7 @@ #include "vector.h" TEST(ScudoVectorTest, Basic) { - scudo::Vector V; + scudo::Vector V; EXPECT_EQ(V.size(), 0U); V.push_back(42); EXPECT_EQ(V.size(), 1U); @@ -23,7 +23,7 @@ TEST(ScudoVectorTest, Basic) { } TEST(ScudoVectorTest, Stride) { - scudo::Vector V; + scudo::Vector V; for (scudo::uptr I = 0; I < 1000; I++) { V.push_back(I); EXPECT_EQ(V.size(), I + 1U); @@ -34,7 +34,7 @@ TEST(ScudoVectorTest, Stride) { } TEST(ScudoVectorTest, ResizeReduction) { - scudo::Vector V; + scudo::Vector V; V.push_back(0); V.push_back(0); EXPECT_EQ(V.size(), 2U); @@ -48,7 +48,7 @@ TEST(ScudoVectorTest, ResizeReduction) { // Verify that if the reallocate fails, nothing new is added. TEST(ScudoVectorTest, ReallocateFails) { - scudo::Vector V; + scudo::Vector V; scudo::uptr capacity = V.capacity(); // Get the current address space size. diff --git a/compiler-rt/lib/scudo/standalone/vector.h b/compiler-rt/lib/scudo/standalone/vector.h index ca10cc281d770..98b3db4ad6980 100644 --- a/compiler-rt/lib/scudo/standalone/vector.h +++ b/compiler-rt/lib/scudo/standalone/vector.h @@ -21,7 +21,7 @@ namespace scudo { // implementation supports only POD types. // // NOTE: This class is not meant to be used directly, use Vector instead. -template class VectorNoCtor { +template class VectorNoCtor { public: T &operator[](uptr I) { DCHECK_LT(I, Size); @@ -116,18 +116,21 @@ template class VectorNoCtor { uptr CapacityBytes = 0; uptr Size = 0; - T LocalData[256 / sizeof(T)] = {}; + T LocalData[StaticNumEntries] = {}; MemMapT ExternalBuffer; }; -template class Vector : public VectorNoCtor { +template +class Vector : public VectorNoCtor { public: - constexpr Vector() { VectorNoCtor::init(); } + static_assert(StaticNumEntries > 0U, + "Vector must have a non-zero number of static entries."); + constexpr Vector() { VectorNoCtor::init(); } explicit Vector(uptr Count) { - VectorNoCtor::init(Count); + VectorNoCtor::init(Count); this->resize(Count); } - ~Vector() { VectorNoCtor::destroy(); } + ~Vector() { VectorNoCtor::destroy(); } // Disallow copies and moves. Vector(const Vector &) = delete; Vector &operator=(const Vector &) = delete; From 07f8a65d09608d67bfd6adbd62bb0999c7363456 Mon Sep 17 00:00:00 2001 From: Oliver Hunt Date: Wed, 17 Jul 2024 15:22:53 -0700 Subject: [PATCH 343/777] [clang] Ensure pointers passed to runtime support functions are correctly signed (#98276) Updates codegen for global destructors and raising exceptions to ensure that the function pointers being passed are signed using the correct schema. Notably this requires that CodeGenFunction::createAtExitStub to return an opaque Constant* rather than a Function* as the value being emitted is no longer necessarily a raw function pointer depending on the configured ABI. 
Co-Authored-By: Akira Hatanaka Co-Authored-By: John McCall --- clang/lib/CodeGen/CGDeclCXX.cpp | 12 ++++++-- clang/lib/CodeGen/CodeGenFunction.h | 2 +- clang/lib/CodeGen/ItaniumCXXABI.cpp | 21 ++++++++++++-- .../CodeGenCXX/ptrauth-static-destructors.cpp | 24 +++++++++++++++ clang/test/CodeGenCXX/ptrauth-throw.cpp | 29 +++++++++++++++++++ 5 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 clang/test/CodeGenCXX/ptrauth-static-destructors.cpp create mode 100644 clang/test/CodeGenCXX/ptrauth-throw.cpp diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 05dd7ddb86fa6..2f56355cff90e 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -232,7 +232,7 @@ void CodeGenFunction::EmitCXXGlobalVarDeclInit(const VarDecl &D, /// Create a stub function, suitable for being passed to atexit, /// which passes the given address to the given destructor function. -llvm::Function *CodeGenFunction::createAtExitStub(const VarDecl &VD, +llvm::Constant *CodeGenFunction::createAtExitStub(const VarDecl &VD, llvm::FunctionCallee dtor, llvm::Constant *addr) { // Get the destructor function type, void(*)(void). @@ -264,7 +264,12 @@ llvm::Function *CodeGenFunction::createAtExitStub(const VarDecl &VD, CGF.FinishFunction(); - return fn; + // Get a proper function pointer. + FunctionProtoType::ExtProtoInfo EPI(getContext().getDefaultCallingConvention( + /*IsVariadic=*/false, /*IsCXXMethod=*/false)); + QualType fnType = getContext().getFunctionType(getContext().VoidTy, + {getContext().VoidPtrTy}, EPI); + return CGM.getFunctionPointer(fn, fnType); } /// Create a stub function, suitable for being passed to __pt_atexit_np, @@ -333,7 +338,8 @@ void CodeGenFunction::registerGlobalDtorWithLLVM(const VarDecl &VD, llvm::FunctionCallee Dtor, llvm::Constant *Addr) { // Create a function which calls the destructor. - llvm::Function *dtorStub = createAtExitStub(VD, Dtor, Addr); + llvm::Function *dtorStub = + cast(createAtExitStub(VD, Dtor, Addr)); CGM.AddGlobalDtor(dtorStub); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index d9f6e5b321341..1aac2ee9a5c90 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4864,7 +4864,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitCXXGlobalVarDeclInit(const VarDecl &D, llvm::GlobalVariable *GV, bool PerformInit); - llvm::Function *createAtExitStub(const VarDecl &VD, llvm::FunctionCallee Dtor, + llvm::Constant *createAtExitStub(const VarDecl &VD, llvm::FunctionCallee Dtor, llvm::Constant *Addr); llvm::Function *createTLSAtExitStub(const VarDecl &VD, diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index e1d056765a866..37b436a21fbc0 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -1321,8 +1321,16 @@ void ItaniumCXXABI::emitThrow(CodeGenFunction &CGF, const CXXThrowExpr *E) { if (const RecordType *RecordTy = ThrowType->getAs()) { CXXRecordDecl *Record = cast(RecordTy->getDecl()); if (!Record->hasTrivialDestructor()) { + // __cxa_throw is declared to take its destructor as void (*)(void *). We + // must match that if function pointers can be authenticated with a + // discriminator based on their type. 
+ const ASTContext &Ctx = getContext(); + QualType DtorTy = Ctx.getFunctionType(Ctx.VoidTy, {Ctx.VoidPtrTy}, + FunctionProtoType::ExtProtoInfo()); + CXXDestructorDecl *DtorD = Record->getDestructor(); Dtor = CGM.getAddrOfCXXStructor(GlobalDecl(DtorD, Dtor_Complete)); + Dtor = CGM.getFunctionPointer(Dtor, DtorTy); } } if (!Dtor) Dtor = llvm::Constant::getNullValue(CGM.Int8PtrTy); @@ -2699,6 +2707,14 @@ static void emitGlobalDtorWithCXAAtExit(CodeGenFunction &CGF, if (llvm::Function *fn = dyn_cast(atexit.getCallee())) fn->setDoesNotThrow(); + const auto &Context = CGF.CGM.getContext(); + FunctionProtoType::ExtProtoInfo EPI(Context.getDefaultCallingConvention( + /*IsVariadic=*/false, /*IsCXXMethod=*/false)); + QualType fnType = + Context.getFunctionType(Context.VoidTy, {Context.VoidPtrTy}, EPI); + llvm::Constant *dtorCallee = cast(dtor.getCallee()); + dtorCallee = CGF.CGM.getFunctionPointer(dtorCallee, fnType); + if (!addr) // addr is null when we are trying to register a dtor annotated with // __attribute__((destructor)) in a constructor function. Using null here is @@ -2706,7 +2722,7 @@ static void emitGlobalDtorWithCXAAtExit(CodeGenFunction &CGF, // function. addr = llvm::Constant::getNullValue(CGF.Int8PtrTy); - llvm::Value *args[] = {dtor.getCallee(), addr, handle}; + llvm::Value *args[] = {dtorCallee, addr, handle}; CGF.EmitNounwindRuntimeCall(atexit, args); } @@ -4907,7 +4923,8 @@ void XLCXXABI::registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D, } // Create __dtor function for the var decl. - llvm::Function *DtorStub = CGF.createAtExitStub(D, Dtor, Addr); + llvm::Function *DtorStub = + cast(CGF.createAtExitStub(D, Dtor, Addr)); // Register above __dtor with atexit(). CGF.registerGlobalDtorWithAtExit(DtorStub); diff --git a/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp b/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp new file mode 100644 index 0000000000000..cad43dc0746df --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-static-destructors.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - \ +// RUN: | FileCheck %s --check-prefix=CXAATEXIT + +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -std=c++11 %s -o - \ +// RUN: -fno-use-cxa-atexit \ +// RUN: | FileCheck %s --check-prefix=ATEXIT + +class Foo { + public: + ~Foo() { + } +}; + +Foo global; + +// CXAATEXIT: define internal void @__cxx_global_var_init() +// CXAATEXIT: call i32 @__cxa_atexit(ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0), ptr @global, ptr @__dso_handle) + + +// ATEXIT: define internal void @__cxx_global_var_init() +// ATEXIT: %{{.*}} = call i32 @atexit(ptr ptrauth (ptr @__dtor_global, i32 0)) + +// ATEXIT: define internal void @__dtor_global() {{.*}} section "__TEXT,__StaticInit,regular,pure_instructions" { +// ATEXIT: %{{.*}} = call ptr @_ZN3FooD1Ev(ptr @global) diff --git a/clang/test/CodeGenCXX/ptrauth-throw.cpp b/clang/test/CodeGenCXX/ptrauth-throw.cpp new file mode 100644 index 0000000000000..cea7226547e5a --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-throw.cpp @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK +// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios -fptrauth-calls -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECKDISC + +class Foo { + public: + ~Foo() { + } +}; + +// CHECK-LABEL: define void @_Z1fv() +// CHECK: call void @__cxa_throw(ptr %{{.*}}, ptr 
@_ZTI3Foo, ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0)) + +// CHECKDISC-LABEL: define void @_Z1fv() +// CHECKDISC: call void @__cxa_throw(ptr %{{.*}}, ptr @_ZTI3Foo, ptr ptrauth (ptr @_ZN3FooD1Ev, i32 0, i64 10942)) + +void f() { + throw Foo(); +} + +// __cxa_throw is defined to take its destructor as "void (*)(void *)" in the ABI. +// CHECK-LABEL: define void @__cxa_throw({{.*}}) +// CHECK: call void {{%.*}}(ptr noundef {{%.*}}) [ "ptrauth"(i32 0, i64 0) ] + +// CHECKDISC-LABEL: define void @__cxa_throw({{.*}}) +// CHECKDISC: call void {{%.*}}(ptr noundef {{%.*}}) [ "ptrauth"(i32 0, i64 10942) ] + +extern "C" void __cxa_throw(void *exception, void *, void (*dtor)(void *)) { + dtor(exception); +} From 7647174738bf1b8e58c854c488183a849403d5db Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 17 Jul 2024 23:28:36 +0100 Subject: [PATCH 344/777] [flang] Add -rtlib flag (#99058) This patch allows the -rtlib flag with flang-new to select between the libgcc_s and compiler-rt runtimes. The behaviour is identical to the same flag with clang. --- clang/include/clang/Driver/Options.td | 2 +- flang/test/Driver/linker-flags.f90 | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2400b193d4d38..25555e4620523 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5622,7 +5622,7 @@ def resource_dir_EQ : Joined<["-"], "resource-dir=">, Flags<[NoXarchOption]>, Alias; def rpath : Separate<["-"], "rpath">, Flags<[LinkerInput]>, Group, Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>; -def rtlib_EQ : Joined<["-", "--"], "rtlib=">, Visibility<[ClangOption, CLOption]>, +def rtlib_EQ : Joined<["-", "--"], "rtlib=">, Visibility<[ClangOption, CLOption, FlangOption]>, HelpText<"Compiler runtime library to use">; def frtlib_add_rpath: Flag<["-"], "frtlib-add-rpath">, Flags<[NoArgumentUnused]>, Visibility<[ClangOption, FlangOption]>, diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90 index 02e217494f818..ac9500d7c45ce 100644 --- a/flang/test/Driver/linker-flags.f90 +++ b/flang/test/Driver/linker-flags.f90 @@ -11,6 +11,7 @@ ! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib ! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib ! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib +! RUN: %flang -### -rtlib=compiler-rt --target=aarch64-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,COMPILER-RT ! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows, ! but it is not needed when compiling Fortran code and they might bring in @@ -33,6 +34,7 @@ ! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" ! SOLARIS-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "-z" "ignore" "-lquadmath" "-z" "record" ! UNIX-SAME: "-lFortranRuntime" "-lFortranDecimal" "-lm" +! COMPILER-RT: "{{.*}}{{\\|/}}libclang_rt.builtins.a" ! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}" ! DARWIN-SAME: "[[object_file]]" @@ -61,3 +63,6 @@ ! MSVC-LABEL: link ! MSVC-SAME: /subsystem:console ! MSVC-SAME: "[[object_file]]" + +! COMPILER-RT-NOT: "-lgcc" +! 
COMPILER-RT-NOT: "-lgcc_s" From 83251a22f623df8d27b6184d19b24c18d314f2bd Mon Sep 17 00:00:00 2001 From: Scallop Ye Date: Thu, 18 Jul 2024 06:55:41 +0800 Subject: [PATCH 345/777] [libFuzzer] Fix incorrect coverage number in fork mode (#82335) Closes #82307. I built LLVM with the changes and tested fuzzing in fork mode. The coverage number was correct: ``` [ye@ye-arch ~]$ /home/ye/work/llvm-project/build/bin/clang++ -fsanitize=fuzzer test_fuzzer.cc [ye@ye-arch ~]$ ./a.out corpus -fork=4 INFO: Running with entropic power schedule (0xFF, 100). INFO: Seed: 3152497917 INFO: Loaded 1 modules (40 inline 8-bit counters): 40 [0x5aa6f7b310d0, 0x5aa6f7b310f8), INFO: Loaded 1 PC tables (40 PCs): 40 [0x5aa6f7b310f8,0x5aa6f7b31378), INFO: -fork=4: fuzzing in separate process(s) INFO: -fork=4: 56 seed inputs, starting to fuzz in /tmp/libFuzzerTemp.FuzzWithFork54465.dir #600649: cov: 36 ft: 224 corp: 56 exec/s: 300324 oom/timeout/crash: 0/0/0 time: 2s job: 1 dft_time: 0 #1548208: cov: 36 ft: 224 corp: 56 exec/s: 315853 oom/timeout/crash: 0/0/0 time: 3s job: 2 dft_time: 0 #2465991: cov: 36 ft: 224 corp: 56 exec/s: 229445 oom/timeout/crash: 0/0/0 time: 4s job: 3 dft_time: 0 #3887877: cov: 36 ft: 224 corp: 56 exec/s: 284377 oom/timeout/crash: 0/0/0 time: 5s job: 4 dft_time: 0 ``` --- compiler-rt/lib/fuzzer/FuzzerFork.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/fuzzer/FuzzerFork.cpp b/compiler-rt/lib/fuzzer/FuzzerFork.cpp index c248a1d246a30..e544cd846e4db 100644 --- a/compiler-rt/lib/fuzzer/FuzzerFork.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerFork.cpp @@ -349,7 +349,7 @@ void FuzzWithFork(Random &Rand, const FuzzingOptions &Options, &NewFeatures, Env.Cov, &NewCov, CFPath, /*Verbose=*/false, /*IsSetCoverMerge=*/false); Env.Features.insert(NewFeatures.begin(), NewFeatures.end()); - Env.Cov.insert(NewFeatures.begin(), NewFeatures.end()); + Env.Cov.insert(NewCov.begin(), NewCov.end()); RemoveFile(CFPath); } From 884772fdd6213c1bc16316b1e57fe08d85bdbc2d Mon Sep 17 00:00:00 2001 From: Akira Hatanaka Date: Wed, 17 Jul 2024 16:19:21 -0700 Subject: [PATCH 346/777] [Sema] Don't drop weak_import from a declaration if its definition isn't seen (#85886) I believe this is what the original commit (33e022650adee965c65f9aea086ee74f3fd1bad5) was trying to do. This fixes a bug where clang removes the attribute from a declaration that follows a declaration directly contained in a linkage-specification. 
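A minimal sketch of the two cases the change distinguishes, modeled on the attr-weak test updates in this patch (identifiers match those tests):

// Only declarations have been seen, the first one directly inside a linkage
// specification: the weak_import attribute is now kept and no warning fires.
extern "C" int g0;
extern int g0 __attribute__((weak_import));

// A definition has already been provided: clang still warns and drops the attribute.
int C;
extern int C __attribute__((weak_import)); // warning: 'C' cannot be declared 'weak_import' because its definition has been provided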
rdar://61865848 --- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Sema/SemaDecl.cpp | 20 ++++++++++--------- clang/test/Sema/attr-weak.c | 8 ++++++-- clang/test/SemaCXX/attr-weak.cpp | 7 +++++++ 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index de3d94155a9a0..b8a43b0a9fe8e 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6075,7 +6075,7 @@ def note_extern_c_global_conflict : Note< def note_extern_c_begins_here : Note< "extern \"C\" language linkage specification begins here">; def warn_weak_import : Warning < - "an already-declared variable is made a weak_import declaration %0">; + "%0 cannot be declared 'weak_import' because its definition has been provided">; def ext_static_non_static : Extension< "redeclaring non-static %0 as static is a Microsoft extension">, InGroup; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index a3dd5ede9116a..6c3589bf87433 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -4518,16 +4518,18 @@ void Sema::MergeVarDecl(VarDecl *New, LookupResult &Previous) { } mergeDeclAttributes(New, Old); - // Warn if an already-declared variable is made a weak_import in a subsequent + // Warn if an already-defined variable is made a weak_import in a subsequent // declaration - if (New->hasAttr() && - Old->getStorageClass() == SC_None && - !Old->hasAttr()) { - Diag(New->getLocation(), diag::warn_weak_import) << New->getDeclName(); - Diag(Old->getLocation(), diag::note_previous_declaration); - // Remove weak_import attribute on new declaration. - New->dropAttr(); - } + if (New->hasAttr()) + for (auto *D = Old; D; D = D->getPreviousDecl()) { + if (D->isThisDeclarationADefinition() != VarDecl::DeclarationOnly) { + Diag(New->getLocation(), diag::warn_weak_import) << New->getDeclName(); + Diag(D->getLocation(), diag::note_previous_definition); + // Remove weak_import attribute on new declaration. 
+ New->dropAttr(); + break; + } + } if (const auto *ILA = New->getAttr()) if (!Old->hasAttr()) { diff --git a/clang/test/Sema/attr-weak.c b/clang/test/Sema/attr-weak.c index b827d1539b997..f6482109bc9f6 100644 --- a/clang/test/Sema/attr-weak.c +++ b/clang/test/Sema/attr-weak.c @@ -16,8 +16,12 @@ struct __attribute__((weak_import)) s1 {}; // expected-warning {{'weak_import' a static int f(void) __attribute__((weak)); // expected-error {{weak declaration cannot have internal linkage}} static int x __attribute__((weak)); // expected-error {{weak declaration cannot have internal linkage}} -int C; // expected-note {{previous declaration is here}} -extern int C __attribute__((weak_import)); // expected-warning {{an already-declared variable is made a weak_import declaration}} +int C; // expected-note {{previous definition is here}} +extern int C __attribute__((weak_import)); // expected-warning {{'C' cannot be declared 'weak_import'}} + +int C2; // expected-note {{previous definition is here}} +extern int C2; +extern int C2 __attribute__((weak_import)); // expected-warning {{'C2' cannot be declared 'weak_import'}} static int pr14946_x; extern int pr14946_x __attribute__((weak)); // expected-error {{weak declaration cannot have internal linkage}} diff --git a/clang/test/SemaCXX/attr-weak.cpp b/clang/test/SemaCXX/attr-weak.cpp index 0f9a2975e5f68..c6272ef5786ef 100644 --- a/clang/test/SemaCXX/attr-weak.cpp +++ b/clang/test/SemaCXX/attr-weak.cpp @@ -56,3 +56,10 @@ constexpr bool weak_method_is_non_null = &WithWeakMember::weak_method != nullptr // virtual member function is present. constexpr bool virtual_weak_method_is_non_null = &WithWeakMember::virtual_weak_method != nullptr; // expected-error {{must be initialized by a constant expression}} // expected-note@-1 {{comparison against pointer to weak member 'WithWeakMember::virtual_weak_method' can only be performed at runtime}} + +// Check that no warnings are emitted. 
+extern "C" int g0; +extern int g0 __attribute__((weak_import)); + +extern "C" int g1 = 0; // expected-note {{previous definition is here}} +extern int g1 __attribute__((weak_import)); // expected-warning {{attribute declaration must precede definition}} From 83fbd79319a4d997520c85ab41997692a58cd958 Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:23:15 -0700 Subject: [PATCH 347/777] [libc] newheadergen: configured cmake (#98828) - all headers in the build system are generated by newheadergen - tested on gpu-build --------- Co-authored-by: Rose Zhang --- libc/CMakeLists.txt | 1 + libc/cmake/modules/LLVMLibCHeaderRules.cmake | 101 ++++- libc/include/CMakeLists.txt | 380 +++++++++++------- .../class_implementation/classes/function.py | 2 +- libc/newhdrgen/header.py | 8 +- 5 files changed, 332 insertions(+), 160 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 6ba54475d0fd1..3b8e4e6c517e9 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -72,6 +72,7 @@ option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF) if(LIBC_BUILD_GPU_LOADER OR (LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD)) add_subdirectory(utils/gpu) endif() +option(LIBC_USE_NEW_HEADER_GEN "Generate header files using new headergen instead of the old one" OFF) set(NEED_LIBC_HDRGEN FALSE) if(NOT LLVM_RUNTIMES_BUILD) diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 7fc6860f23eb2..91054810f5ec5 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -66,7 +66,106 @@ function(add_header target_name) ) endfunction(add_header) -# A rule for generated header file targets. 
+function(add_gen_header2 target_name) + cmake_parse_arguments( + "ADD_GEN_HDR2" + "PUBLIC" # No optional arguments + "YAML_FILE;DEF_FILE;GEN_HDR" # Single value arguments + "DEPENDS" # Multi value arguments + ${ARGN} + ) + get_fq_target_name(${target_name} fq_target_name) + if(NOT LLVM_LIBC_FULL_BUILD) + add_library(${fq_target_name} INTERFACE) + return() + endif() + if(NOT ADD_GEN_HDR2_DEF_FILE) + message(FATAL_ERROR "`add_gen_hdr2` rule requires DEF_FILE to be specified.") + endif() + if(NOT ADD_GEN_HDR2_GEN_HDR) + message(FATAL_ERROR "`add_gen_hdr2` rule requires GEN_HDR to be specified.") + endif() + if(NOT ADD_GEN_HDR2_YAML_FILE) + message(FATAL_ERROR "`add_gen_hdr2` rule requires YAML_FILE to be specified.") + endif() + + set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_GEN_HDR}) + file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path}) + set(out_file ${LIBC_INCLUDE_DIR}/${relative_path}) + set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR2_YAML_FILE}) + set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_DEF_FILE}) + + set(fq_data_files "") + if(ADD_GEN_HDR2_DATA_FILES) + foreach(data_file IN LISTS ADD_GEN_HDR2_DATA_FILES) + list(APPEND fq_data_files "${CMAKE_CURRENT_SOURCE_DIR}/${data_file}") + endforeach(data_file) + endif() + + set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}") + list(TRANSFORM entry_points PREPEND "--e=") + + add_custom_command( + OUTPUT ${out_file} + COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/newhdrgen/yaml_to_classes.py + ${yaml_file} + --h_def_file ${def_file} + ${entry_points} + --output_dir ${out_file} + DEPENDS ${yaml_file} ${def_file} ${fq_data_files} + COMMENT "Generating header ${ADD_GEN_HDR2_GE2N_HDR} from ${yaml_file} and ${def_file}" + ) + if(LIBC_TARGET_OS_IS_GPU) + file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls) + file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls/gpu) + set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path}) + add_custom_command( + OUTPUT ${decl_out_file} + COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/newhdrgen/yaml_to_classes.py + ${yaml_file} + --export-decls + ${entry_points} + --output_dir ${decl_out_file} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS ${yaml_file} ${fq_data_files} + ) + endif() + + if(ADD_GEN_HDR2_DEPENDS) + get_fq_deps_list(fq_deps_list ${ADD_GEN_HDR2_DEPENDS}) + # Dependencies of a add_header target can only be another add_gen_header target + # or an add_header target. 
+ foreach(dep IN LISTS fq_deps_list) + get_target_property(header_file ${dep} HEADER_FILE_PATH) + if(NOT header_file) + message(FATAL_ERROR "Invalid dependency '${dep}' for '${fq_target_name}'.") + endif() + endforeach() + endif() + set(generated_hdr_target ${fq_target_name}.__generated_hdr__) + add_custom_target( + ${generated_hdr_target} + DEPENDS ${out_file} ${fq_deps_list} ${decl_out_file} + ) + + add_header_library( + ${target_name} + HDRS + ${out_file} + ) + + add_dependencies(${fq_target_name} ${generated_hdr_target}) + + set_target_properties( + ${fq_target_name} + PROPERTIES + HEADER_FILE_PATH ${out_file} + DEPS "${fq_deps_list}" + ) + + +endfunction(add_gen_header2) + # Usage: # add_gen_header( # diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 2cf7206f3a625..bbc0f7abafd55 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -17,18 +17,41 @@ add_header( __llvm-libc-common.h ) -add_gen_header( +macro(add_header_macro TARGET_NAME YAML_FILE DEF_FILE GEN_HDR DEPENDS) + if (LIBC_USE_NEW_HEADER_GEN) + add_gen_header2( + ${TARGET_NAME} + YAML_FILE ${YAML_FILE} + DEF_FILE ${DEF_FILE} + GEN_HDR ${GEN_HDR} + ${DEPENDS} + ${ARGN} + ) + else() + add_gen_header( + ${TARGET_NAME} + DEF_FILE ${DEF_FILE} + GEN_HDR ${GEN_HDR} + ${DEPENDS} + ${ARGN} + ) + endif() +endmacro() + +add_header_macro( ctype - DEF_FILE ctype.h.def - GEN_HDR ctype.h + ../libc/newhdrgen/yaml/ctype.yaml + ctype.h.def + ctype.h DEPENDS .llvm_libc_common_h ) -add_gen_header( +add_header_macro( dirent - DEF_FILE dirent.h.def - GEN_HDR dirent.h + ../libc/newhdrgen/yaml/dirent.yaml + dirent.h.def + dirent.h DEPENDS .llvm_libc_common_h .llvm-libc-types.ino_t @@ -36,10 +59,11 @@ add_gen_header( .llvm-libc-types.struct_dirent ) -add_gen_header( +add_header_macro( fcntl - DEF_FILE fcntl.h.def - GEN_HDR fcntl.h + ../libc/newhdrgen/yaml/fcntl.yaml + fcntl.h.def + fcntl.h DEPENDS .llvm-libc-macros.fcntl_macros .llvm-libc-types.mode_t @@ -51,28 +75,31 @@ add_gen_header( .llvm_libc_common_h ) -add_gen_header( +add_header_macro( dlfcn - DEF_FILE dlfcn.h.def - GEN_HDR dlfcn.h + ../libc/newhdrgen/yaml/dlfcn.yaml + dlfcn.h.def + dlfcn.h DEPENDS .llvm-libc-macros.dlfcn_macros .llvm_libc_common_h ) -add_gen_header( +add_header_macro( features - DEF_FILE features.h.def - GEN_HDR features.h + ../libc/newhdrgen/yaml/features.yaml + features.h.def + features.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.features_macros ) -add_gen_header( +add_header_macro( fenv - DEF_FILE fenv.h.def - GEN_HDR fenv.h + ../libc/newhdrgen/yaml/fenv.yaml + fenv.h.def + fenv.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.fenv_macros @@ -80,58 +107,63 @@ add_gen_header( .llvm-libc-types.fexcept_t ) -add_gen_header( +add_header_macro( inttypes - DEF_FILE inttypes.h.def - GEN_HDR inttypes.h + ../libc/newhdrgen/yaml/inttypes.yaml + inttypes.h.def + inttypes.h DEPENDS .llvm_libc_common_h .llvm-libc-types.imaxdiv_t .llvm-libc-macros.inttypes_macros ) -add_gen_header( +add_header_macro( float - DEF_FILE float.h.def - GEN_HDR float.h + ../libc/newhdrgen/yaml/float.yaml + float.h.def + float.h DEPENDS .llvm-libc-macros.float_macros ) -add_gen_header( +add_header_macro( stdint - DEF_FILE stdint.h.def - GEN_HDR stdint.h + ../libc/newhdrgen/yaml/stdint.yaml + stdint.h.def + stdint.h DEPENDS .llvm-libc-macros.stdint_macros ) -add_gen_header( +add_header_macro( limits - DEF_FILE limits.h.def - GEN_HDR limits.h + ../libc/newhdrgen/yaml/limits.yaml + limits.h.def + limits.h DEPENDS .llvm-libc-macros.limits_macros ) 
-add_gen_header( +add_header_macro( math - DEF_FILE math.h.def - GEN_HDR math.h + ../libc/newhdrgen/yaml/math.yaml + math.h.def + math.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.float16_macros .llvm-libc-macros.math_macros - .llvm-libc-macros.math_function_macros .llvm-libc-types.double_t .llvm-libc-types.float_t .llvm-libc-types.float128 ) -add_gen_header( +add_header_macro( stdfix - DEF_FILE stdfix.h.def - GEN_HDR stdfix.h + ../libc/newhdrgen/yaml/stdfix.yaml + stdfix.h.def + stdfix.h DEPENDS .llvm-libc-macros.stdfix_macros ) @@ -139,55 +171,61 @@ add_gen_header( # TODO: This should be conditional on POSIX networking being included. file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa) -add_gen_header( +add_header_macro( arpa_inet - DEF_FILE arpa/inet.h.def - GEN_HDR arpa/inet.h + ../libc/newhdrgen/yaml/arpa_inet.yaml + arpa/inet.h.def + arpa/inet.h DEPENDS .llvm_libc_common_h ) -add_gen_header( +add_header_macro( assert - DEF_FILE assert.h.def - GEN_HDR assert.h + ../libc/newhdrgen/yaml/assert.yaml + assert.h.def + assert.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.assert_macros ) -add_gen_header( +add_header_macro( setjmp - DEF_FILE setjmp.h.def - GEN_HDR setjmp.h + ../libc/newhdrgen/yaml/setjmp.yaml + setjmp.h.def + setjmp.h DEPENDS .llvm_libc_common_h .llvm-libc-types.jmp_buf ) -add_gen_header( +add_header_macro( string - DEF_FILE string.h.def - GEN_HDR string.h + ../libc/newhdrgen/yaml/string.yaml + string.h.def + string.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.null_macro .llvm-libc-types.size_t ) -add_gen_header( +add_header_macro( strings - DEF_FILE strings.h.def - GEN_HDR strings.h + ../libc/newhdrgen/yaml/strings.yaml + strings.h.def + strings.h DEPENDS .llvm_libc_common_h .llvm-libc-types.size_t ) -add_gen_header( +add_header_macro( search - DEF_FILE search.h.def - GEN_HDR search.h + ../libc/newhdrgen/yaml/search.yaml + search.h.def + search.h DEPENDS .llvm_libc_common_h .llvm-libc-types.ACTION @@ -196,10 +234,11 @@ add_gen_header( .llvm-libc-types.size_t ) -add_gen_header( +add_header_macro( time - DEF_FILE time.h.def - GEN_HDR time.h + ../libc/newhdrgen/yaml/time.yaml + time.h.def + time.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.time_macros @@ -211,10 +250,11 @@ add_gen_header( .llvm-libc-types.clockid_t ) -add_gen_header( +add_header_macro( threads - DEF_FILE threads.h.def - GEN_HDR threads.h + ../libc/newhdrgen/yaml/threads.yaml + threads.h.def + threads.h DEPENDS .llvm_libc_common_h .llvm-libc-types.__call_once_func_t @@ -227,19 +267,21 @@ add_gen_header( .llvm-libc-types.tss_dtor_t ) -add_gen_header( +add_header_macro( errno - DEF_FILE errno.h.def - GEN_HDR errno.h + ../libc/newhdrgen/yaml/errno.yaml + errno.h.def + errno.h DEPENDS .llvm-libc-macros.generic_error_number_macros .llvm-libc-macros.error_number_macros ) -add_gen_header( +add_header_macro( signal - DEF_FILE signal.h.def - GEN_HDR signal.h + ../libc/newhdrgen/yaml/signal.yaml + signal.h.def + signal.h DEPENDS .llvm-libc-macros.signal_macros .llvm-libc-types.sig_atomic_t @@ -251,28 +293,31 @@ add_gen_header( .llvm-libc-types.pid_t ) -add_gen_header( +add_header_macro( stdbit - DEF_FILE stdbit.h.def - GEN_HDR stdbit.h + ../libc/newhdrgen/yaml/stdbit.yaml + stdbit.h.def + stdbit.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.stdbit_macros ) -add_gen_header( +add_header_macro( stdckdint - DEF_FILE stdckdint.h.def - GEN_HDR stdckdint.h + ../libc/newhdrgen/yaml/stdckdint.yaml + stdckdint.h.def + stdckdint.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.stdckdint_macros ) -add_gen_header( 
+add_header_macro( stdio - DEF_FILE stdio.h.def - GEN_HDR stdio.h + ../libc/newhdrgen/yaml/stdio.yaml + stdio.h.def + stdio.h DEPENDS .llvm-libc-macros.file_seek_macros .llvm-libc-macros.stdio_macros @@ -284,10 +329,11 @@ add_gen_header( .llvm_libc_common_h ) -add_gen_header( +add_header_macro( stdlib - DEF_FILE stdlib.h.def - GEN_HDR stdlib.h + ../libc/newhdrgen/yaml/stdlib.yaml + stdlib.h.def + stdlib.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.stdlib_macros @@ -301,10 +347,11 @@ add_gen_header( .llvm-libc-types.__atexithandler_t ) -add_gen_header( +add_header_macro( unistd - DEF_FILE unistd.h.def - GEN_HDR unistd.h + ../libc/newhdrgen/yaml/unistd.yaml + unistd.h.def + unistd.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.file_seek_macros @@ -319,10 +366,11 @@ add_gen_header( .llvm-libc-types.__getoptargv_t ) -add_gen_header( +add_header_macro( pthread - DEF_FILE pthread.h.def - GEN_HDR pthread.h + ../libc/newhdrgen/yaml/pthread.yaml + pthread.h.def + pthread.h DEPENDS .llvm_libc_common_h .llvm-libc-types.__atfork_callback_t @@ -340,10 +388,11 @@ add_gen_header( .llvm-libc-types.pthread_t ) -add_gen_header( +add_header_macro( sched - DEF_FILE sched.h.def - GEN_HDR sched.h + ../libc/newhdrgen/yaml/sched.yaml + sched.h.def + sched.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sched_macros @@ -356,10 +405,11 @@ add_gen_header( .llvm-libc-types.struct_timespec ) -add_gen_header( +add_header_macro( spawn - DEF_FILE spawn.h.def - GEN_HDR spawn.h + ../libc/newhdrgen/yaml/spawn.yaml + spawn.h.def + spawn.h DEPENDS .llvm_libc_common_h .llvm-libc-types.mode_t @@ -373,19 +423,21 @@ add_gen_header( # them. file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys) -add_gen_header( +add_header_macro( sys_auxv - DEF_FILE sys/auxv.h.def - GEN_HDR sys/auxv.h + ../libc/newhdrgen/yaml/sys/sys_auxv.yaml + sys/auxv.h.def + sys/auxv.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_auxv_macros ) -add_gen_header( +add_header_macro( sys_epoll - DEF_FILE sys/epoll.h.def - GEN_HDR sys/epoll.h + ../libc/newhdrgen/yaml/sys/sys_epoll.yaml + sys/epoll.h.def + sys/epoll.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_epoll_event @@ -394,19 +446,21 @@ add_gen_header( .llvm-libc-macros.sys_epoll_macros ) -add_gen_header( +add_header_macro( sys_ioctl - DEF_FILE sys/ioctl.h.def - GEN_HDR sys/ioctl.h + ../libc/newhdrgen/yaml/sys/sys_ioctl.yaml + sys/ioctl.h.def + sys/ioctl.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_ioctl_macros ) -add_gen_header( +add_header_macro( sys_mman - DEF_FILE sys/mman.h.def - GEN_HDR sys/mman.h + ../libc/newhdrgen/yaml/sys/sys_mman.yaml + sys/mman.h.def + sys/mman.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_mman_macros @@ -415,10 +469,11 @@ add_gen_header( .llvm-libc-types.ssize_t ) -add_gen_header( +add_header_macro( sys_prctl - DEF_FILE sys/prctl.h.def - GEN_HDR sys/prctl.h + ../libc/newhdrgen/yaml/sys/sys_prctl.yaml + sys/prctl.h.def + sys/prctl.h DEPENDS .llvm_libc_common_h ) @@ -431,10 +486,11 @@ add_header( .llvm-libc-macros.sys_queue_macros ) -add_gen_header( +add_header_macro( sys_random - DEF_FILE sys/random.h.def - GEN_HDR sys/random.h + ../libc/newhdrgen/yaml/sys/sys_random.yaml + sys/random.h.def + sys/random.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_random_macros @@ -442,10 +498,11 @@ add_gen_header( .llvm-libc-types.ssize_t ) -add_gen_header( +add_header_macro( sys_resource - DEF_FILE sys/resource.h.def - GEN_HDR sys/resource.h + ../libc/newhdrgen/yaml/sys/sys_resource.yaml + sys/resource.h.def + sys/resource.h DEPENDS .llvm_libc_common_h 
.llvm-libc-macros.sys_resource_macros @@ -453,10 +510,11 @@ add_gen_header( .llvm-libc-types.struct_rlimit ) -add_gen_header( +add_header_macro( sys_stat - DEF_FILE sys/stat.h.def - GEN_HDR sys/stat.h + ../libc/newhdrgen/yaml/sys/sys_stat.yaml + sys/stat.h.def + sys/stat.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_stat_macros @@ -474,10 +532,11 @@ add_gen_header( .llvm-libc-types.struct_stat ) -add_gen_header( +add_header_macro( sys_select - DEF_FILE sys/select.h.def - GEN_HDR sys/select.h + ../libc/newhdrgen/yaml/sys/sys_select.yaml + sys/select.h.def + sys/select.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_select_macros @@ -489,10 +548,11 @@ add_gen_header( .llvm-libc-types.struct_timeval ) -add_gen_header( +add_header_macro( sys_sendfile - DEF_FILE sys/sendfile.h.def - GEN_HDR sys/sendfile.h + ../libc/newhdrgen/yaml/sys/sys_sendfile.yaml + sys/sendfile.h.def + sys/sendfile.h DEPENDS .llvm_libc_common_h .llvm-libc-types.off_t @@ -500,10 +560,11 @@ add_gen_header( .llvm-libc-types.ssize_t ) -add_gen_header( +add_header_macro( sys_socket - DEF_FILE sys/socket.h.def - GEN_HDR sys/socket.h + ../libc/newhdrgen/yaml/sys/sys_socket.yaml + sys/socket.h.def + sys/socket.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_socket_macros @@ -513,35 +574,40 @@ add_gen_header( .llvm-libc-types.struct_sockaddr_un ) -add_gen_header( +add_header_macro( sys_statvfs - DEF_FILE sys/statvfs.h.def - GEN_HDR sys/statvfs.h + ../libc/newhdrgen/yaml/sys/sys_statvfs.yaml + sys/statvfs.h.def + sys/statvfs.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_statvfs ) -add_gen_header( +add_header_macro( sys_syscall - DEF_FILE sys/syscall.h.def - GEN_HDR sys/syscall.h + ../libc/newhdrgen/yaml/sys/sys_syscall.yaml + sys/syscall.h.def + sys/syscall.h + DEPENDS ) -add_gen_header( +add_header_macro( sys_time - DEF_FILE sys/time.h.def - GEN_HDR sys/time.h + ../libc/newhdrgen/yaml/sys/sys_time.yaml + sys/time.h.def + sys/time.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_timeval .llvm-libc-macros.sys_time_macros ) -add_gen_header( +add_header_macro( sys_types - DEF_FILE sys/types.h.def - GEN_HDR sys/types.h + ../libc/newhdrgen/yaml/sys/sys_types.yaml + sys/types.h.def + sys/types.h DEPENDS .llvm_libc_common_h .llvm-libc-types.blkcnt_t @@ -567,19 +633,21 @@ add_gen_header( .llvm-libc-types.uid_t ) -add_gen_header( +add_header_macro( sys_utsname - DEF_FILE sys/utsname.h.def - GEN_HDR sys/utsname.h + ../libc/newhdrgen/yaml/sys/sys_utsname.yaml + sys/utsname.h.def + sys/utsname.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_utsname ) -add_gen_header( +add_header_macro( sys_wait - DEF_FILE sys/wait.h.def - GEN_HDR sys/wait.h + ../libc/newhdrgen/yaml/sys/sys_wait.yaml + sys/wait.h.def + sys/wait.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_wait_macros @@ -588,10 +656,11 @@ add_gen_header( .llvm-libc-types.siginfo_t ) -add_gen_header( +add_header_macro( termios - DEF_FILE termios.h.def - GEN_HDR termios.h + ../libc/newhdrgen/yaml/termios.yaml + termios.h.def + termios.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.termios_macros @@ -602,10 +671,11 @@ add_gen_header( .llvm-libc-types.tcflag_t ) -add_gen_header( +add_header_macro( uchar - DEF_FILE uchar.h.def - GEN_HDR uchar.h + ../libc/newhdrgen/yaml/uchar.yaml + uchar.h.def + uchar.h DEPENDS .llvm_libc_common_h .llvm-libc-types.mbstate_t @@ -614,10 +684,11 @@ add_gen_header( .llvm-libc-types.char32_t ) -add_gen_header( +add_header_macro( wchar - DEF_FILE wchar.h.def - GEN_HDR wchar.h + ../libc/newhdrgen/yaml/wchar.yaml + wchar.h.def + 
wchar.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.wchar_macros @@ -630,10 +701,11 @@ add_gen_header( if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) - add_gen_header( + add_header_macro( gpu_rpc - DEF_FILE gpu/rpc.h.def - GEN_HDR gpu/rpc.h + ../libc/newhdrgen/yaml/rpc.yaml + gpu/rpc.h.def + gpu/rpc.h DEPENDS .llvm_libc_common_h .llvm-libc-types.rpc_opcodes_t diff --git a/libc/newhdrgen/class_implementation/classes/function.py b/libc/newhdrgen/class_implementation/classes/function.py index ccfd93547c1d8..845ef1aebf54b 100644 --- a/libc/newhdrgen/class_implementation/classes/function.py +++ b/libc/newhdrgen/class_implementation/classes/function.py @@ -26,7 +26,7 @@ def __str__(self): attributes_str = " ".join(self.attributes) arguments_str = ", ".join(self.arguments) if attributes_str == "": - result = f"{self.return_type} {self.name}({arguments_str});" + result = f"{self.return_type} {self.name}({arguments_str})" else: result = f"{attributes_str} {self.return_type} {self.name}({arguments_str})" return result diff --git a/libc/newhdrgen/header.py b/libc/newhdrgen/header.py index 69de81eebb719..d1f0fe96dbc60 100644 --- a/libc/newhdrgen/header.py +++ b/libc/newhdrgen/header.py @@ -60,16 +60,16 @@ def __str__(self): current_guard = None for function in self.functions: if function.guard == None: - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + "__NOEXCEPT;") content.append("") else: if current_guard == None: current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + "__NOEXCEPT;") content.append("") elif current_guard == function.guard: - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + "__NOEXCEPT;") content.append("") else: content.pop() @@ -77,7 +77,7 @@ def __str__(self): content.append("") current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + "__NOEXCEPT;") content.append("") if current_guard != None: content.pop() From 9ce5b38dc32a5f023e9824afe246978130b9080e Mon Sep 17 00:00:00 2001 From: RoseZhang03 Date: Wed, 17 Jul 2024 23:34:53 +0000 Subject: [PATCH 348/777] [libc] final edits to newheadergen yaml files (#98983) - final run of integration tests to deal with incorrect YAML input (finished sys headers, will finish the rest today) - add any new functions made in recent PRs --- libc/config/linux/x86_64/headers.txt | 1 + libc/newhdrgen/yaml/{ => arpa}/arpa_inet.yaml | 5 +---- libc/newhdrgen/yaml/assert.yaml | 1 - libc/newhdrgen/yaml/{rpc.yaml => gpu/gpu_rpc.yaml} | 0 libc/newhdrgen/yaml/math.yaml | 6 ++++++ libc/newhdrgen/yaml/pthread.yaml | 7 ++++--- libc/newhdrgen/yaml/search.yaml | 1 - libc/newhdrgen/yaml/sys/sys_time.yaml | 3 +-- libc/newhdrgen/yaml/wchar.yaml | 1 + 9 files changed, 14 insertions(+), 11 deletions(-) rename libc/newhdrgen/yaml/{ => arpa}/arpa_inet.yaml (86%) rename libc/newhdrgen/yaml/{rpc.yaml => gpu/gpu_rpc.yaml} (100%) diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index df276894246c4..0294f62bc2f7a 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -45,6 +45,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.sys_select libc.include.sys_socket libc.include.sys_stat + libc.include.sys_statvfs libc.include.sys_syscall libc.include.sys_time libc.include.sys_types diff --git a/libc/newhdrgen/yaml/arpa_inet.yaml 
b/libc/newhdrgen/yaml/arpa/arpa_inet.yaml similarity index 86% rename from libc/newhdrgen/yaml/arpa_inet.yaml rename to libc/newhdrgen/yaml/arpa/arpa_inet.yaml index 945a602705dba..c01235d4327a5 100644 --- a/libc/newhdrgen/yaml/arpa_inet.yaml +++ b/libc/newhdrgen/yaml/arpa/arpa_inet.yaml @@ -1,9 +1,6 @@ header: arpa-inet.h macros: [] -types: - - type_name: uint32_t - - type_name: uint16_t - - type_name: inttypes.h +types: [] enums: [] objects: [] functions: diff --git a/libc/newhdrgen/yaml/assert.yaml b/libc/newhdrgen/yaml/assert.yaml index 9ad0f0628274e..58d6c413cebdc 100644 --- a/libc/newhdrgen/yaml/assert.yaml +++ b/libc/newhdrgen/yaml/assert.yaml @@ -13,4 +13,3 @@ functions: - type: const char * - type: unsigned - type: const char * - guard: __cplusplus diff --git a/libc/newhdrgen/yaml/rpc.yaml b/libc/newhdrgen/yaml/gpu/gpu_rpc.yaml similarity index 100% rename from libc/newhdrgen/yaml/rpc.yaml rename to libc/newhdrgen/yaml/gpu/gpu_rpc.yaml diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index 5afde59b6b558..8588389bca4d2 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -7,6 +7,12 @@ types: enums: [] objects: [] functions: + - name: cbrt + standards: + - stdc + return_type: double + arguments: + - type: double - name: cbrtf standards: - stdc diff --git a/libc/newhdrgen/yaml/pthread.yaml b/libc/newhdrgen/yaml/pthread.yaml index f22767eb1b752..292d91751e406 100644 --- a/libc/newhdrgen/yaml/pthread.yaml +++ b/libc/newhdrgen/yaml/pthread.yaml @@ -8,12 +8,12 @@ types: - type_name: pthread_key_t - type_name: pthread_condattr_t - type_name: __pthread_tss_dtor_t + - type_name: pthread_rwlock_t - type_name: pthread_rwlockattr_t - type_name: pthread_attr_t - type_name: __pthread_start_t - type_name: __pthread_once_func_t - type_name: __atfork_callback_t - - type_name: pthread_rwlock_t enums: [] functions: - name: pthread_atfork @@ -106,7 +106,7 @@ functions: return_type: int arguments: - type: const pthread_condattr_t *__restrict - - type: clockid_t * __restrict + - type: clockid_t *__restrict - name: pthread_condattr_getpshared standards: - POSIX @@ -200,7 +200,8 @@ functions: standards: - POSIX return_type: pthread_t - arguments: [] + arguments: + - type: void - name: pthread_setname_np standards: - GNUExtensions diff --git a/libc/newhdrgen/yaml/search.yaml b/libc/newhdrgen/yaml/search.yaml index a7983a70bda73..b4fde14f771a2 100644 --- a/libc/newhdrgen/yaml/search.yaml +++ b/libc/newhdrgen/yaml/search.yaml @@ -1,7 +1,6 @@ header: search.h macros: [] types: - - type_name: size_t - type_name: struct_hsearch_data - type_name: ENTRY - type_name: ACTION diff --git a/libc/newhdrgen/yaml/sys/sys_time.yaml b/libc/newhdrgen/yaml/sys/sys_time.yaml index a901cdafd26a1..eb3dd548389b3 100644 --- a/libc/newhdrgen/yaml/sys/sys_time.yaml +++ b/libc/newhdrgen/yaml/sys/sys_time.yaml @@ -1,8 +1,7 @@ header: sys-time.h standards: Linux macros: [] -types: - - type_name: struct_timeval +types: [] enums: [] functions: [] objects: [] diff --git a/libc/newhdrgen/yaml/wchar.yaml b/libc/newhdrgen/yaml/wchar.yaml index 663267fb69d73..92ecdc26fbc73 100644 --- a/libc/newhdrgen/yaml/wchar.yaml +++ b/libc/newhdrgen/yaml/wchar.yaml @@ -4,6 +4,7 @@ types: - type_name: size_t - type_name: wint_t - type_name: wchar_t + - type_name: mbstate_t.h enums: [] objects: [] functions: From e0f3484874964ed749355d5a652e876efe3a05de Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Wed, 17 Jul 2024 16:37:59 -0700 Subject: [PATCH 349/777] [lldb][nfc] add an nfc entry to the 
.git-blame-ignore-revs. --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 07f23b0109573..cc625f95ff69e 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -84,3 +84,6 @@ b9079baaddfed5e604fbfaa1d81a7a1c38e78c26 # [libc++][NFC] Run clang-format on libcxx/include again (#95874) e2c2ffbe7a1b5d9e32a2ce64279475b50c4cba5b + +# [lldb][nfc] Deindent ProcessGDBRemote::SetThreadStopInfo by two levels +b32931c5b32eb0d2cf37d688b34f8548c9674c19 From 90cbb1ec4ff9c687f7ebca505845388655ed5582 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 17 Jul 2024 16:42:18 -0700 Subject: [PATCH 350/777] [libc] Temporarily disable statvfs header (#99405) The statfvs header was not generating for a while. Patch #98983 added it to the list of headers, but it's apparently broken right now so this patch comments it out until it can be fixed. --- libc/config/linux/x86_64/headers.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 0294f62bc2f7a..8a52d80e1fbfb 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -45,7 +45,8 @@ set(TARGET_PUBLIC_HEADERS libc.include.sys_select libc.include.sys_socket libc.include.sys_stat - libc.include.sys_statvfs + # statvfs is broken, will uncomment once it's fixed. + # libc.include.sys_statvfs libc.include.sys_syscall libc.include.sys_time libc.include.sys_types From ad023a844ab19f37ea0abd2130ec81ea2663937b Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:43:57 -0700 Subject: [PATCH 351/777] [libc] newheadergen: cmakelist file changes (#99404) --- libc/include/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index bbc0f7abafd55..cd92ad126bc48 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -173,7 +173,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa) add_header_macro( arpa_inet - ../libc/newhdrgen/yaml/arpa_inet.yaml + ../libc/newhdrgen/yaml/arpa/arpa_inet.yaml arpa/inet.h.def arpa/inet.h DEPENDS @@ -703,7 +703,7 @@ if(LIBC_TARGET_OS_IS_GPU) add_header_macro( gpu_rpc - ../libc/newhdrgen/yaml/rpc.yaml + ../libc/newhdrgen/yaml/gpu/gpu_rpc.yaml gpu/rpc.h.def gpu/rpc.h DEPENDS From d772cdd6279de1e578dfdfca7432327a1806c659 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 17 Jul 2024 16:44:21 -0700 Subject: [PATCH 352/777] [ADT] Make set_subtract more efficient when subtrahend is larger (NFC) (#99401) This patch is based on: commit fffe2728534a238ff0024e11a18280f85094dcde Author: Teresa Johnson Date: Wed Jul 17 13:53:10 2024 -0700 This iteration comes with a couple of improvements: - We now accommodate S2Ty being SmallPtrSet, which has remove_if(pred) but not erase(iterator). (Lack of this code path broke the mlir build.) - The code path for erase(iterator) now pre-increments the iterator to avoid problems with iterator invalidation. 
--- llvm/include/llvm/ADT/SetOperations.h | 37 +++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/llvm/include/llvm/ADT/SetOperations.h b/llvm/include/llvm/ADT/SetOperations.h index 1a911b239f4c6..2b1a103565f7d 100644 --- a/llvm/include/llvm/ADT/SetOperations.h +++ b/llvm/include/llvm/ADT/SetOperations.h @@ -27,6 +27,15 @@ using check_has_member_remove_if_t = template static constexpr bool HasMemberRemoveIf = is_detected::value; + +template +using check_has_member_erase_iter_t = + decltype(std::declval().erase(std::declval().begin())); + +template +static constexpr bool HasMemberEraseIter = + is_detected::value; + } // namespace detail /// set_union(A, B) - Compute A := A u B, return whether A changed. @@ -94,7 +103,35 @@ S1Ty set_difference(const S1Ty &S1, const S2Ty &S2) { /// set_subtract(A, B) - Compute A := A - B /// +/// Selects the set to iterate based on the relative sizes of A and B for better +/// efficiency. +/// template void set_subtract(S1Ty &S1, const S2Ty &S2) { + // If S1 is smaller than S2, iterate on S1 provided that S2 supports efficient + // lookups via contains(). Note that a couple callers pass a vector for S2, + // which doesn't support contains(), and wouldn't be efficient if it did. + using ElemTy = decltype(*S1.begin()); + if constexpr (detail::HasMemberContains) { + auto Pred = [&S2](const auto &E) { return S2.contains(E); }; + if constexpr (detail::HasMemberRemoveIf) { + if (S1.size() < S2.size()) { + S1.remove_if(Pred); + return; + } + } else if constexpr (detail::HasMemberEraseIter) { + if (S1.size() < S2.size()) { + typename S1Ty::iterator Next; + for (typename S1Ty::iterator SI = S1.begin(), SE = S1.end(); SI != SE; + SI = Next) { + Next = std::next(SI); + if (S2.contains(*SI)) + S1.erase(SI); + } + return; + } + } + } + for (typename S2Ty::const_iterator SI = S2.begin(), SE = S2.end(); SI != SE; ++SI) S1.erase(*SI); From 21c8c22a93086794c9023bfdbf2fc8f6ff99f90e Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:56:18 -0700 Subject: [PATCH 353/777] [libc] newheadergen: removing extra .h (#99408) --- libc/newhdrgen/yaml/wchar.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/newhdrgen/yaml/wchar.yaml b/libc/newhdrgen/yaml/wchar.yaml index 92ecdc26fbc73..e5627d14ed9be 100644 --- a/libc/newhdrgen/yaml/wchar.yaml +++ b/libc/newhdrgen/yaml/wchar.yaml @@ -4,7 +4,7 @@ types: - type_name: size_t - type_name: wint_t - type_name: wchar_t - - type_name: mbstate_t.h + - type_name: mbstate_t enums: [] objects: [] functions: From ab142c635e5edeb381fb3bd0222501cd2108c176 Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:00:58 -0700 Subject: [PATCH 354/777] [libc] newheadergen: quick fix to fuchsia build (#99410) --- libc/include/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index cd92ad126bc48..3dd4985806263 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -157,6 +157,7 @@ add_header_macro( .llvm-libc-types.double_t .llvm-libc-types.float_t .llvm-libc-types.float128 + .llvm-libc-macros.math_function_macros ) add_header_macro( From d5fe73515a609639c63013478236bd81978db6b7 Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:15:18 -0700 Subject: [PATCH 355/777] Revert "[libc] newheadergen: quick fix to fuchsia build" 
(#99412) Reverts llvm/llvm-project#99410 --- libc/include/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 3dd4985806263..cd92ad126bc48 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -157,7 +157,6 @@ add_header_macro( .llvm-libc-types.double_t .llvm-libc-types.float_t .llvm-libc-types.float128 - .llvm-libc-macros.math_function_macros ) add_header_macro( From ad4da8304cd75aecbdbe6d235ec70af8fa9e7bcb Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:16:33 -0700 Subject: [PATCH 356/777] Revert "[libc] newheadergen: cmakelist file changes" (#99413) Reverts llvm/llvm-project#99404 --- libc/include/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index cd92ad126bc48..bbc0f7abafd55 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -173,7 +173,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa) add_header_macro( arpa_inet - ../libc/newhdrgen/yaml/arpa/arpa_inet.yaml + ../libc/newhdrgen/yaml/arpa_inet.yaml arpa/inet.h.def arpa/inet.h DEPENDS @@ -703,7 +703,7 @@ if(LIBC_TARGET_OS_IS_GPU) add_header_macro( gpu_rpc - ../libc/newhdrgen/yaml/gpu/gpu_rpc.yaml + ../libc/newhdrgen/yaml/rpc.yaml gpu/rpc.h.def gpu/rpc.h DEPENDS From 58d4ca06bdc5e2f8f9bf8bfd22ebd0577557a4fe Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:17:05 -0700 Subject: [PATCH 357/777] Revert "[libc] newheadergen: configured cmake" (#99414) Reverts llvm/llvm-project#98828 --- libc/CMakeLists.txt | 1 - libc/cmake/modules/LLVMLibCHeaderRules.cmake | 101 +---- libc/include/CMakeLists.txt | 380 +++++++----------- .../class_implementation/classes/function.py | 2 +- libc/newhdrgen/header.py | 8 +- 5 files changed, 160 insertions(+), 332 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 3b8e4e6c517e9..6ba54475d0fd1 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -72,7 +72,6 @@ option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF) if(LIBC_BUILD_GPU_LOADER OR (LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD)) add_subdirectory(utils/gpu) endif() -option(LIBC_USE_NEW_HEADER_GEN "Generate header files using new headergen instead of the old one" OFF) set(NEED_LIBC_HDRGEN FALSE) if(NOT LLVM_RUNTIMES_BUILD) diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 91054810f5ec5..7fc6860f23eb2 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -66,106 +66,7 @@ function(add_header target_name) ) endfunction(add_header) -function(add_gen_header2 target_name) - cmake_parse_arguments( - "ADD_GEN_HDR2" - "PUBLIC" # No optional arguments - "YAML_FILE;DEF_FILE;GEN_HDR" # Single value arguments - "DEPENDS" # Multi value arguments - ${ARGN} - ) - get_fq_target_name(${target_name} fq_target_name) - if(NOT LLVM_LIBC_FULL_BUILD) - add_library(${fq_target_name} INTERFACE) - return() - endif() - if(NOT ADD_GEN_HDR2_DEF_FILE) - message(FATAL_ERROR "`add_gen_hdr2` rule requires DEF_FILE to be specified.") - endif() - if(NOT ADD_GEN_HDR2_GEN_HDR) - message(FATAL_ERROR "`add_gen_hdr2` rule requires GEN_HDR to be specified.") - endif() - if(NOT ADD_GEN_HDR2_YAML_FILE) - message(FATAL_ERROR "`add_gen_hdr2` rule requires YAML_FILE to be 
specified.") - endif() - - set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_GEN_HDR}) - file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path}) - set(out_file ${LIBC_INCLUDE_DIR}/${relative_path}) - set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR2_YAML_FILE}) - set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_DEF_FILE}) - - set(fq_data_files "") - if(ADD_GEN_HDR2_DATA_FILES) - foreach(data_file IN LISTS ADD_GEN_HDR2_DATA_FILES) - list(APPEND fq_data_files "${CMAKE_CURRENT_SOURCE_DIR}/${data_file}") - endforeach(data_file) - endif() - - set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}") - list(TRANSFORM entry_points PREPEND "--e=") - - add_custom_command( - OUTPUT ${out_file} - COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/newhdrgen/yaml_to_classes.py - ${yaml_file} - --h_def_file ${def_file} - ${entry_points} - --output_dir ${out_file} - DEPENDS ${yaml_file} ${def_file} ${fq_data_files} - COMMENT "Generating header ${ADD_GEN_HDR2_GE2N_HDR} from ${yaml_file} and ${def_file}" - ) - if(LIBC_TARGET_OS_IS_GPU) - file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls) - file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls/gpu) - set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path}) - add_custom_command( - OUTPUT ${decl_out_file} - COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/newhdrgen/yaml_to_classes.py - ${yaml_file} - --export-decls - ${entry_points} - --output_dir ${decl_out_file} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${yaml_file} ${fq_data_files} - ) - endif() - - if(ADD_GEN_HDR2_DEPENDS) - get_fq_deps_list(fq_deps_list ${ADD_GEN_HDR2_DEPENDS}) - # Dependencies of a add_header target can only be another add_gen_header target - # or an add_header target. - foreach(dep IN LISTS fq_deps_list) - get_target_property(header_file ${dep} HEADER_FILE_PATH) - if(NOT header_file) - message(FATAL_ERROR "Invalid dependency '${dep}' for '${fq_target_name}'.") - endif() - endforeach() - endif() - set(generated_hdr_target ${fq_target_name}.__generated_hdr__) - add_custom_target( - ${generated_hdr_target} - DEPENDS ${out_file} ${fq_deps_list} ${decl_out_file} - ) - - add_header_library( - ${target_name} - HDRS - ${out_file} - ) - - add_dependencies(${fq_target_name} ${generated_hdr_target}) - - set_target_properties( - ${fq_target_name} - PROPERTIES - HEADER_FILE_PATH ${out_file} - DEPS "${fq_deps_list}" - ) - - -endfunction(add_gen_header2) - +# A rule for generated header file targets. 
# Usage: # add_gen_header( # diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index bbc0f7abafd55..2cf7206f3a625 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -17,41 +17,18 @@ add_header( __llvm-libc-common.h ) -macro(add_header_macro TARGET_NAME YAML_FILE DEF_FILE GEN_HDR DEPENDS) - if (LIBC_USE_NEW_HEADER_GEN) - add_gen_header2( - ${TARGET_NAME} - YAML_FILE ${YAML_FILE} - DEF_FILE ${DEF_FILE} - GEN_HDR ${GEN_HDR} - ${DEPENDS} - ${ARGN} - ) - else() - add_gen_header( - ${TARGET_NAME} - DEF_FILE ${DEF_FILE} - GEN_HDR ${GEN_HDR} - ${DEPENDS} - ${ARGN} - ) - endif() -endmacro() - -add_header_macro( +add_gen_header( ctype - ../libc/newhdrgen/yaml/ctype.yaml - ctype.h.def - ctype.h + DEF_FILE ctype.h.def + GEN_HDR ctype.h DEPENDS .llvm_libc_common_h ) -add_header_macro( +add_gen_header( dirent - ../libc/newhdrgen/yaml/dirent.yaml - dirent.h.def - dirent.h + DEF_FILE dirent.h.def + GEN_HDR dirent.h DEPENDS .llvm_libc_common_h .llvm-libc-types.ino_t @@ -59,11 +36,10 @@ add_header_macro( .llvm-libc-types.struct_dirent ) -add_header_macro( +add_gen_header( fcntl - ../libc/newhdrgen/yaml/fcntl.yaml - fcntl.h.def - fcntl.h + DEF_FILE fcntl.h.def + GEN_HDR fcntl.h DEPENDS .llvm-libc-macros.fcntl_macros .llvm-libc-types.mode_t @@ -75,31 +51,28 @@ add_header_macro( .llvm_libc_common_h ) -add_header_macro( +add_gen_header( dlfcn - ../libc/newhdrgen/yaml/dlfcn.yaml - dlfcn.h.def - dlfcn.h + DEF_FILE dlfcn.h.def + GEN_HDR dlfcn.h DEPENDS .llvm-libc-macros.dlfcn_macros .llvm_libc_common_h ) -add_header_macro( +add_gen_header( features - ../libc/newhdrgen/yaml/features.yaml - features.h.def - features.h + DEF_FILE features.h.def + GEN_HDR features.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.features_macros ) -add_header_macro( +add_gen_header( fenv - ../libc/newhdrgen/yaml/fenv.yaml - fenv.h.def - fenv.h + DEF_FILE fenv.h.def + GEN_HDR fenv.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.fenv_macros @@ -107,63 +80,58 @@ add_header_macro( .llvm-libc-types.fexcept_t ) -add_header_macro( +add_gen_header( inttypes - ../libc/newhdrgen/yaml/inttypes.yaml - inttypes.h.def - inttypes.h + DEF_FILE inttypes.h.def + GEN_HDR inttypes.h DEPENDS .llvm_libc_common_h .llvm-libc-types.imaxdiv_t .llvm-libc-macros.inttypes_macros ) -add_header_macro( +add_gen_header( float - ../libc/newhdrgen/yaml/float.yaml - float.h.def - float.h + DEF_FILE float.h.def + GEN_HDR float.h DEPENDS .llvm-libc-macros.float_macros ) -add_header_macro( +add_gen_header( stdint - ../libc/newhdrgen/yaml/stdint.yaml - stdint.h.def - stdint.h + DEF_FILE stdint.h.def + GEN_HDR stdint.h DEPENDS .llvm-libc-macros.stdint_macros ) -add_header_macro( +add_gen_header( limits - ../libc/newhdrgen/yaml/limits.yaml - limits.h.def - limits.h + DEF_FILE limits.h.def + GEN_HDR limits.h DEPENDS .llvm-libc-macros.limits_macros ) -add_header_macro( +add_gen_header( math - ../libc/newhdrgen/yaml/math.yaml - math.h.def - math.h + DEF_FILE math.h.def + GEN_HDR math.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.float16_macros .llvm-libc-macros.math_macros + .llvm-libc-macros.math_function_macros .llvm-libc-types.double_t .llvm-libc-types.float_t .llvm-libc-types.float128 ) -add_header_macro( +add_gen_header( stdfix - ../libc/newhdrgen/yaml/stdfix.yaml - stdfix.h.def - stdfix.h + DEF_FILE stdfix.h.def + GEN_HDR stdfix.h DEPENDS .llvm-libc-macros.stdfix_macros ) @@ -171,61 +139,55 @@ add_header_macro( # TODO: This should be conditional on POSIX networking being included. 
file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa) -add_header_macro( +add_gen_header( arpa_inet - ../libc/newhdrgen/yaml/arpa_inet.yaml - arpa/inet.h.def - arpa/inet.h + DEF_FILE arpa/inet.h.def + GEN_HDR arpa/inet.h DEPENDS .llvm_libc_common_h ) -add_header_macro( +add_gen_header( assert - ../libc/newhdrgen/yaml/assert.yaml - assert.h.def - assert.h + DEF_FILE assert.h.def + GEN_HDR assert.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.assert_macros ) -add_header_macro( +add_gen_header( setjmp - ../libc/newhdrgen/yaml/setjmp.yaml - setjmp.h.def - setjmp.h + DEF_FILE setjmp.h.def + GEN_HDR setjmp.h DEPENDS .llvm_libc_common_h .llvm-libc-types.jmp_buf ) -add_header_macro( +add_gen_header( string - ../libc/newhdrgen/yaml/string.yaml - string.h.def - string.h + DEF_FILE string.h.def + GEN_HDR string.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.null_macro .llvm-libc-types.size_t ) -add_header_macro( +add_gen_header( strings - ../libc/newhdrgen/yaml/strings.yaml - strings.h.def - strings.h + DEF_FILE strings.h.def + GEN_HDR strings.h DEPENDS .llvm_libc_common_h .llvm-libc-types.size_t ) -add_header_macro( +add_gen_header( search - ../libc/newhdrgen/yaml/search.yaml - search.h.def - search.h + DEF_FILE search.h.def + GEN_HDR search.h DEPENDS .llvm_libc_common_h .llvm-libc-types.ACTION @@ -234,11 +196,10 @@ add_header_macro( .llvm-libc-types.size_t ) -add_header_macro( +add_gen_header( time - ../libc/newhdrgen/yaml/time.yaml - time.h.def - time.h + DEF_FILE time.h.def + GEN_HDR time.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.time_macros @@ -250,11 +211,10 @@ add_header_macro( .llvm-libc-types.clockid_t ) -add_header_macro( +add_gen_header( threads - ../libc/newhdrgen/yaml/threads.yaml - threads.h.def - threads.h + DEF_FILE threads.h.def + GEN_HDR threads.h DEPENDS .llvm_libc_common_h .llvm-libc-types.__call_once_func_t @@ -267,21 +227,19 @@ add_header_macro( .llvm-libc-types.tss_dtor_t ) -add_header_macro( +add_gen_header( errno - ../libc/newhdrgen/yaml/errno.yaml - errno.h.def - errno.h + DEF_FILE errno.h.def + GEN_HDR errno.h DEPENDS .llvm-libc-macros.generic_error_number_macros .llvm-libc-macros.error_number_macros ) -add_header_macro( +add_gen_header( signal - ../libc/newhdrgen/yaml/signal.yaml - signal.h.def - signal.h + DEF_FILE signal.h.def + GEN_HDR signal.h DEPENDS .llvm-libc-macros.signal_macros .llvm-libc-types.sig_atomic_t @@ -293,31 +251,28 @@ add_header_macro( .llvm-libc-types.pid_t ) -add_header_macro( +add_gen_header( stdbit - ../libc/newhdrgen/yaml/stdbit.yaml - stdbit.h.def - stdbit.h + DEF_FILE stdbit.h.def + GEN_HDR stdbit.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.stdbit_macros ) -add_header_macro( +add_gen_header( stdckdint - ../libc/newhdrgen/yaml/stdckdint.yaml - stdckdint.h.def - stdckdint.h + DEF_FILE stdckdint.h.def + GEN_HDR stdckdint.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.stdckdint_macros ) -add_header_macro( +add_gen_header( stdio - ../libc/newhdrgen/yaml/stdio.yaml - stdio.h.def - stdio.h + DEF_FILE stdio.h.def + GEN_HDR stdio.h DEPENDS .llvm-libc-macros.file_seek_macros .llvm-libc-macros.stdio_macros @@ -329,11 +284,10 @@ add_header_macro( .llvm_libc_common_h ) -add_header_macro( +add_gen_header( stdlib - ../libc/newhdrgen/yaml/stdlib.yaml - stdlib.h.def - stdlib.h + DEF_FILE stdlib.h.def + GEN_HDR stdlib.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.stdlib_macros @@ -347,11 +301,10 @@ add_header_macro( .llvm-libc-types.__atexithandler_t ) -add_header_macro( +add_gen_header( unistd - ../libc/newhdrgen/yaml/unistd.yaml - unistd.h.def - unistd.h 
+ DEF_FILE unistd.h.def + GEN_HDR unistd.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.file_seek_macros @@ -366,11 +319,10 @@ add_header_macro( .llvm-libc-types.__getoptargv_t ) -add_header_macro( +add_gen_header( pthread - ../libc/newhdrgen/yaml/pthread.yaml - pthread.h.def - pthread.h + DEF_FILE pthread.h.def + GEN_HDR pthread.h DEPENDS .llvm_libc_common_h .llvm-libc-types.__atfork_callback_t @@ -388,11 +340,10 @@ add_header_macro( .llvm-libc-types.pthread_t ) -add_header_macro( +add_gen_header( sched - ../libc/newhdrgen/yaml/sched.yaml - sched.h.def - sched.h + DEF_FILE sched.h.def + GEN_HDR sched.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sched_macros @@ -405,11 +356,10 @@ add_header_macro( .llvm-libc-types.struct_timespec ) -add_header_macro( +add_gen_header( spawn - ../libc/newhdrgen/yaml/spawn.yaml - spawn.h.def - spawn.h + DEF_FILE spawn.h.def + GEN_HDR spawn.h DEPENDS .llvm_libc_common_h .llvm-libc-types.mode_t @@ -423,21 +373,19 @@ add_header_macro( # them. file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys) -add_header_macro( +add_gen_header( sys_auxv - ../libc/newhdrgen/yaml/sys/sys_auxv.yaml - sys/auxv.h.def - sys/auxv.h + DEF_FILE sys/auxv.h.def + GEN_HDR sys/auxv.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_auxv_macros ) -add_header_macro( +add_gen_header( sys_epoll - ../libc/newhdrgen/yaml/sys/sys_epoll.yaml - sys/epoll.h.def - sys/epoll.h + DEF_FILE sys/epoll.h.def + GEN_HDR sys/epoll.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_epoll_event @@ -446,21 +394,19 @@ add_header_macro( .llvm-libc-macros.sys_epoll_macros ) -add_header_macro( +add_gen_header( sys_ioctl - ../libc/newhdrgen/yaml/sys/sys_ioctl.yaml - sys/ioctl.h.def - sys/ioctl.h + DEF_FILE sys/ioctl.h.def + GEN_HDR sys/ioctl.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_ioctl_macros ) -add_header_macro( +add_gen_header( sys_mman - ../libc/newhdrgen/yaml/sys/sys_mman.yaml - sys/mman.h.def - sys/mman.h + DEF_FILE sys/mman.h.def + GEN_HDR sys/mman.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_mman_macros @@ -469,11 +415,10 @@ add_header_macro( .llvm-libc-types.ssize_t ) -add_header_macro( +add_gen_header( sys_prctl - ../libc/newhdrgen/yaml/sys/sys_prctl.yaml - sys/prctl.h.def - sys/prctl.h + DEF_FILE sys/prctl.h.def + GEN_HDR sys/prctl.h DEPENDS .llvm_libc_common_h ) @@ -486,11 +431,10 @@ add_header( .llvm-libc-macros.sys_queue_macros ) -add_header_macro( +add_gen_header( sys_random - ../libc/newhdrgen/yaml/sys/sys_random.yaml - sys/random.h.def - sys/random.h + DEF_FILE sys/random.h.def + GEN_HDR sys/random.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_random_macros @@ -498,11 +442,10 @@ add_header_macro( .llvm-libc-types.ssize_t ) -add_header_macro( +add_gen_header( sys_resource - ../libc/newhdrgen/yaml/sys/sys_resource.yaml - sys/resource.h.def - sys/resource.h + DEF_FILE sys/resource.h.def + GEN_HDR sys/resource.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_resource_macros @@ -510,11 +453,10 @@ add_header_macro( .llvm-libc-types.struct_rlimit ) -add_header_macro( +add_gen_header( sys_stat - ../libc/newhdrgen/yaml/sys/sys_stat.yaml - sys/stat.h.def - sys/stat.h + DEF_FILE sys/stat.h.def + GEN_HDR sys/stat.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_stat_macros @@ -532,11 +474,10 @@ add_header_macro( .llvm-libc-types.struct_stat ) -add_header_macro( +add_gen_header( sys_select - ../libc/newhdrgen/yaml/sys/sys_select.yaml - sys/select.h.def - sys/select.h + DEF_FILE sys/select.h.def + GEN_HDR sys/select.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_select_macros @@ 
-548,11 +489,10 @@ add_header_macro( .llvm-libc-types.struct_timeval ) -add_header_macro( +add_gen_header( sys_sendfile - ../libc/newhdrgen/yaml/sys/sys_sendfile.yaml - sys/sendfile.h.def - sys/sendfile.h + DEF_FILE sys/sendfile.h.def + GEN_HDR sys/sendfile.h DEPENDS .llvm_libc_common_h .llvm-libc-types.off_t @@ -560,11 +500,10 @@ add_header_macro( .llvm-libc-types.ssize_t ) -add_header_macro( +add_gen_header( sys_socket - ../libc/newhdrgen/yaml/sys/sys_socket.yaml - sys/socket.h.def - sys/socket.h + DEF_FILE sys/socket.h.def + GEN_HDR sys/socket.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_socket_macros @@ -574,40 +513,35 @@ add_header_macro( .llvm-libc-types.struct_sockaddr_un ) -add_header_macro( +add_gen_header( sys_statvfs - ../libc/newhdrgen/yaml/sys/sys_statvfs.yaml - sys/statvfs.h.def - sys/statvfs.h + DEF_FILE sys/statvfs.h.def + GEN_HDR sys/statvfs.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_statvfs ) -add_header_macro( +add_gen_header( sys_syscall - ../libc/newhdrgen/yaml/sys/sys_syscall.yaml - sys/syscall.h.def - sys/syscall.h - DEPENDS + DEF_FILE sys/syscall.h.def + GEN_HDR sys/syscall.h ) -add_header_macro( +add_gen_header( sys_time - ../libc/newhdrgen/yaml/sys/sys_time.yaml - sys/time.h.def - sys/time.h + DEF_FILE sys/time.h.def + GEN_HDR sys/time.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_timeval .llvm-libc-macros.sys_time_macros ) -add_header_macro( +add_gen_header( sys_types - ../libc/newhdrgen/yaml/sys/sys_types.yaml - sys/types.h.def - sys/types.h + DEF_FILE sys/types.h.def + GEN_HDR sys/types.h DEPENDS .llvm_libc_common_h .llvm-libc-types.blkcnt_t @@ -633,21 +567,19 @@ add_header_macro( .llvm-libc-types.uid_t ) -add_header_macro( +add_gen_header( sys_utsname - ../libc/newhdrgen/yaml/sys/sys_utsname.yaml - sys/utsname.h.def - sys/utsname.h + DEF_FILE sys/utsname.h.def + GEN_HDR sys/utsname.h DEPENDS .llvm_libc_common_h .llvm-libc-types.struct_utsname ) -add_header_macro( +add_gen_header( sys_wait - ../libc/newhdrgen/yaml/sys/sys_wait.yaml - sys/wait.h.def - sys/wait.h + DEF_FILE sys/wait.h.def + GEN_HDR sys/wait.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.sys_wait_macros @@ -656,11 +588,10 @@ add_header_macro( .llvm-libc-types.siginfo_t ) -add_header_macro( +add_gen_header( termios - ../libc/newhdrgen/yaml/termios.yaml - termios.h.def - termios.h + DEF_FILE termios.h.def + GEN_HDR termios.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.termios_macros @@ -671,11 +602,10 @@ add_header_macro( .llvm-libc-types.tcflag_t ) -add_header_macro( +add_gen_header( uchar - ../libc/newhdrgen/yaml/uchar.yaml - uchar.h.def - uchar.h + DEF_FILE uchar.h.def + GEN_HDR uchar.h DEPENDS .llvm_libc_common_h .llvm-libc-types.mbstate_t @@ -684,11 +614,10 @@ add_header_macro( .llvm-libc-types.char32_t ) -add_header_macro( +add_gen_header( wchar - ../libc/newhdrgen/yaml/wchar.yaml - wchar.h.def - wchar.h + DEF_FILE wchar.h.def + GEN_HDR wchar.h DEPENDS .llvm_libc_common_h .llvm-libc-macros.wchar_macros @@ -701,11 +630,10 @@ add_header_macro( if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) - add_header_macro( + add_gen_header( gpu_rpc - ../libc/newhdrgen/yaml/rpc.yaml - gpu/rpc.h.def - gpu/rpc.h + DEF_FILE gpu/rpc.h.def + GEN_HDR gpu/rpc.h DEPENDS .llvm_libc_common_h .llvm-libc-types.rpc_opcodes_t diff --git a/libc/newhdrgen/class_implementation/classes/function.py b/libc/newhdrgen/class_implementation/classes/function.py index 845ef1aebf54b..ccfd93547c1d8 100644 --- a/libc/newhdrgen/class_implementation/classes/function.py +++ 
b/libc/newhdrgen/class_implementation/classes/function.py @@ -26,7 +26,7 @@ def __str__(self): attributes_str = " ".join(self.attributes) arguments_str = ", ".join(self.arguments) if attributes_str == "": - result = f"{self.return_type} {self.name}({arguments_str})" + result = f"{self.return_type} {self.name}({arguments_str});" else: result = f"{attributes_str} {self.return_type} {self.name}({arguments_str})" return result diff --git a/libc/newhdrgen/header.py b/libc/newhdrgen/header.py index d1f0fe96dbc60..69de81eebb719 100644 --- a/libc/newhdrgen/header.py +++ b/libc/newhdrgen/header.py @@ -60,16 +60,16 @@ def __str__(self): current_guard = None for function in self.functions: if function.guard == None: - content.append(str(function) + "__NOEXCEPT;") + content.append(str(function) + "__NOEXCEPT") content.append("") else: if current_guard == None: current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function) + "__NOEXCEPT;") + content.append(str(function) + "__NOEXCEPT") content.append("") elif current_guard == function.guard: - content.append(str(function) + "__NOEXCEPT;") + content.append(str(function) + "__NOEXCEPT") content.append("") else: content.pop() @@ -77,7 +77,7 @@ def __str__(self): content.append("") current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function) + "__NOEXCEPT;") + content.append(str(function) + "__NOEXCEPT") content.append("") if current_guard != None: content.pop() From 4283f1ad18db9878b98f98e7a36b4f94ab674d29 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 17 Jul 2024 17:22:39 -0700 Subject: [PATCH 358/777] [NFC][fuzzer] Remove unhelpful lit notes They are not actionable. --- compiler-rt/test/fuzzer/lit.cfg.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/compiler-rt/test/fuzzer/lit.cfg.py b/compiler-rt/test/fuzzer/lit.cfg.py index 0c0550ac729d6..75d4cf2e4c529 100644 --- a/compiler-rt/test/fuzzer/lit.cfg.py +++ b/compiler-rt/test/fuzzer/lit.cfg.py @@ -33,19 +33,18 @@ ): lit_config.note("lsan feature unavailable") else: - lit_config.note("lsan feature available") config.available_features.add("lsan") # MemorySanitizer is not supported on OSX or Windows right now if ( sys.platform.startswith("darwin") or sys.platform.startswith("win") - or config.target_arch == "i386" ): lit_config.note("msan feature unavailable") assert "msan" not in config.available_features +elif config.target_arch == "i386": + assert "msan" not in config.available_features else: - lit_config.note("msan feature available") config.available_features.add("msan") if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): @@ -57,10 +56,7 @@ if sys.platform.startswith("linux"): # Note the value of ``sys.platform`` is not consistent # between python 2 and 3, hence the use of ``.startswith()``. - lit_config.note("linux feature available") config.available_features.add("linux") -else: - lit_config.note("linux feature unavailable") if config.arm_thumb: config.available_features.add("thumb") From 888b130bdfd98bda71e14fb10893113cbbd15733 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Wed, 17 Jul 2024 18:26:59 -0700 Subject: [PATCH 359/777] [lld][WebAssembly] Consolidate --fatal-warnings and --no-fatal-warnings options. NFC (#99374) Also document defaults for boolean options. 
See https://reviews.llvm.org/D42859 --- lld/wasm/Options.td | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index bf8134dc33cc1..3a70ee65f7c4f 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -58,7 +58,7 @@ def compress_relocations: F<"compress-relocations">, HelpText<"Compress the relocation targets in the code section.">; defm demangle: B<"demangle", - "Demangle symbol names", + "Demangle symbol names (default)", "Do not demangle symbol names">; def emit_relocs: F<"emit-relocs">, HelpText<"Generate relocations in output">; @@ -79,15 +79,16 @@ def entry: S<"entry">, MetaVarName<"">, defm error_limit: EEq<"error-limit", "Maximum number of errors to emit before stopping (0 = no limit)">; -def fatal_warnings: F<"fatal-warnings">, - HelpText<"Treat warnings as errors">; +defm fatal_warnings: B<"fatal-warnings", + "Treat warnings as errors", + "Do not treat warnings as errors (default)">; defm gc_sections: B<"gc-sections", - "Enable garbage collection of unused sections", + "Enable garbage collection of unused sections (defualt)", "Disable garbage collection of unused sections">; defm merge_data_segments: BB<"merge-data-segments", - "Enable merging data segments", + "Enable merging data segments (default)", "Disable merging data segments">; def help: F<"help">, HelpText<"Print option help">; @@ -104,8 +105,6 @@ defm mllvm: Eq<"mllvm", "Additional arguments to forward to LLVM's option proces defm Map: Eq<"Map", "Print a link map to the specified file">; -def no_fatal_warnings: F<"no-fatal-warnings">; - def o: JoinedOrSeparate<["-"], "o">, MetaVarName<"">, HelpText<"Path to file to write output">; @@ -117,7 +116,7 @@ defm pie: B<"pie", defm print_gc_sections: B<"print-gc-sections", "List removed unused sections", - "Do not list removed unused sections">; + "Do not list removed unused sections (default)">; def print_map: F<"print-map">, HelpText<"Print a link map to the standard output">; From c41fa0fdd7e14019fc48bece2a2b0b00c88c8518 Mon Sep 17 00:00:00 2001 From: hev Date: Thu, 18 Jul 2024 09:32:45 +0800 Subject: [PATCH 360/777] [LoongArch] Remove spurious mask operations from andn->icmp on 16 and 8 bit values (#99272) --- .../LoongArch/LoongArchISelLowering.cpp | 162 ++++++++++++++++++ llvm/test/CodeGen/LoongArch/andn-icmp.ll | 56 ++---- 2 files changed, 178 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ba6be85c7f2e8..6072e5e244263 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -335,6 +335,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SETCC); // Set DAG combine for 'LSX' feature. 
@@ -2528,6 +2529,165 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static bool checkValueWidth(SDValue V, ISD::LoadExtType &ExtType) { + ExtType = ISD::NON_EXTLOAD; + + switch (V.getNode()->getOpcode()) { + case ISD::LOAD: { + LoadSDNode *LoadNode = cast(V.getNode()); + if ((LoadNode->getMemoryVT() == MVT::i8) || + (LoadNode->getMemoryVT() == MVT::i16)) { + ExtType = LoadNode->getExtensionType(); + return true; + } + return false; + } + case ISD::AssertSext: { + VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8) || (TypeNode->getVT() == MVT::i16)) { + ExtType = ISD::SEXTLOAD; + return true; + } + return false; + } + case ISD::AssertZext: { + VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8) || (TypeNode->getVT() == MVT::i16)) { + ExtType = ISD::ZEXTLOAD; + return true; + } + return false; + } + default: + return false; + } + + return false; +} + +// Eliminate redundant truncation and zero-extension nodes. +// * Case 1: +// +------------+ +------------+ +------------+ +// | Input1 | | Input2 | | CC | +// +------------+ +------------+ +------------+ +// | | | +// V V +----+ +// +------------+ +------------+ | +// | TRUNCATE | | TRUNCATE | | +// +------------+ +------------+ | +// | | | +// V V | +// +------------+ +------------+ | +// | ZERO_EXT | | ZERO_EXT | | +// +------------+ +------------+ | +// | | | +// | +-------------+ | +// V V | | +// +----------------+ | | +// | AND | | | +// +----------------+ | | +// | | | +// +---------------+ | | +// | | | +// V V V +// +-------------+ +// | CMP | +// +-------------+ +// * Case 2: +// +------------+ +------------+ +-------------+ +------------+ +------------+ +// | Input1 | | Input2 | | Constant -1 | | Constant 0 | | CC | +// +------------+ +------------+ +-------------+ +------------+ +------------+ +// | | | | | +// V | | | | +// +------------+ | | | | +// | XOR |<---------------------+ | | +// +------------+ | | | +// | | | | +// V V +---------------+ | +// +------------+ +------------+ | | +// | TRUNCATE | | TRUNCATE | | +-------------------------+ +// +------------+ +------------+ | | +// | | | | +// V V | | +// +------------+ +------------+ | | +// | ZERO_EXT | | ZERO_EXT | | | +// +------------+ +------------+ | | +// | | | | +// V V | | +// +----------------+ | | +// | AND | | | +// +----------------+ | | +// | | | +// +---------------+ | | +// | | | +// V V V +// +-------------+ +// | CMP | +// +-------------+ +static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + ISD::CondCode CC = cast(N->getOperand(2))->get(); + + SDNode *AndNode = N->getOperand(0).getNode(); + if (AndNode->getOpcode() != ISD::AND) + return SDValue(); + + SDValue AndInputValue2 = AndNode->getOperand(1); + if (AndInputValue2.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + + SDValue CmpInputValue = N->getOperand(1); + SDValue AndInputValue1 = AndNode->getOperand(0); + if (AndInputValue1.getOpcode() == ISD::XOR) { + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + ConstantSDNode *CN = dyn_cast(AndInputValue1.getOperand(1)); + if (!CN || CN->getSExtValue() != -1) + return SDValue(); + CN = dyn_cast(CmpInputValue); + if (!CN || CN->getSExtValue() != 0) + return SDValue(); + AndInputValue1 = AndInputValue1.getOperand(0); + if (AndInputValue1.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + } else if (AndInputValue1.getOpcode() 
== ISD::ZERO_EXTEND) { + if (AndInputValue2 != CmpInputValue) + return SDValue(); + } else { + return SDValue(); + } + + SDValue TruncValue1 = AndInputValue1.getNode()->getOperand(0); + if (TruncValue1.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue TruncValue2 = AndInputValue2.getNode()->getOperand(0); + if (TruncValue2.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue TruncInputValue1 = TruncValue1.getNode()->getOperand(0); + SDValue TruncInputValue2 = TruncValue2.getNode()->getOperand(0); + ISD::LoadExtType ExtType1; + ISD::LoadExtType ExtType2; + + if (!checkValueWidth(TruncInputValue1, ExtType1) || + !checkValueWidth(TruncInputValue2, ExtType2)) + return SDValue(); + + if ((ExtType2 != ISD::ZEXTLOAD) && + ((ExtType2 != ISD::SEXTLOAD) && (ExtType1 != ISD::SEXTLOAD))) + return SDValue(); + + // These truncation and zero-extension nodes are not necessary, remove them. + SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N), AndNode->getValueType(0), + TruncInputValue1, TruncInputValue2); + SDValue NewSetCC = + DAG.getSetCC(SDLoc(N), N->getValueType(0), NewAnd, TruncInputValue2, CC); + DAG.ReplaceAllUsesWith(N, NewSetCC.getNode()); + return SDValue(N, 0); +} + // Combine (loongarch_bitrev_w (loongarch_revb_2w X)) to loongarch_bitrev_4b. static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -3155,6 +3315,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, return performANDCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DAG, DCI, Subtarget); + case ISD::SETCC: + return performSETCCCombine(N, DAG, DCI, Subtarget); case ISD::SRL: return performSRLCombine(N, DAG, DCI, Subtarget); case LoongArchISD::BITREV_W: diff --git a/llvm/test/CodeGen/LoongArch/andn-icmp.ll b/llvm/test/CodeGen/LoongArch/andn-icmp.ll index 4fc3c8df4664c..6d07e7a947297 100644 --- a/llvm/test/CodeGen/LoongArch/andn-icmp.ll +++ b/llvm/test/CodeGen/LoongArch/andn-icmp.ll @@ -6,14 +6,12 @@ define i1 @andn_icmp_eq_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_eq_i8: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_eq_i8: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -25,14 +23,12 @@ define i1 @andn_icmp_eq_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_eq_i16: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_eq_i16: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -80,14 +76,12 @@ define i1 @andn_icmp_ne_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ne_i8: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltu $a0, $zero, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ne_i8: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltu $a0, $zero, $a0 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -99,14 +93,12 @@ define i1 @andn_icmp_ne_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ne_i16: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 ; LA32-NEXT: sltu $a0, $zero, $a0 ; 
LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ne_i16: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 ; LA64-NEXT: sltu $a0, $zero, $a0 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -153,15 +145,13 @@ define i1 @andn_icmp_ne_i64(i64 %a, i64 %b) nounwind { define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ult_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ult_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -172,15 +162,13 @@ define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ult_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ult_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -191,16 +179,14 @@ define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind { define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_uge_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_uge_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -212,16 +198,14 @@ define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_uge_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_uge_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -233,15 +217,13 @@ define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind { define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ugt_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ugt_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -252,15 +234,13 @@ define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ugt_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: 
ret ; ; LA64-LABEL: andn_icmp_ugt_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -271,16 +251,14 @@ define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind { define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ule_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ule_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -292,16 +270,14 @@ define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_ule_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ule_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ule_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret From fe6c24000f2d7316899d4ec4c12273892326ed47 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 18 Jul 2024 10:10:22 +0800 Subject: [PATCH 361/777] [clangd] [C++20] [Modules] Introduce initial support for C++20 Modules (#66462) Alternatives to https://reviews.llvm.org/D153114. Try to address https://github.com/clangd/clangd/issues/1293. See the links for design ideas and the consensus so far. We want to have some initial support in clang18. This is the initial support for C++20 Modules in clangd. As suggested by sammccall in https://reviews.llvm.org/D153114, we should minimize the scope of the initial patch to make it easier to review and understand so that every one are in the same page: > Don't attempt any cross-file or cross-version coordination: i.e. don't > try to reuse BMIs between different files, don't try to reuse BMIs > between (preamble) reparses of the same file, don't try to persist the > module graph. Instead, when building a preamble, synchronously scan > for the module graph, build the required PCMs on the single preamble > thread with filenames private to that preamble, and then proceed to > build the preamble. This patch reflects the above opinions. # Testing in real-world project I tested this with a modularized library: https://github.com/alibaba/async_simple/tree/CXX20Modules. This library has 3 modules (async_simple, std and asio) and 65 module units. (Note that a module consists of multiple module units). Both `std` module and `asio` module have 100k+ lines of code (maybe more, I didn't count). And async_simple itself has 8k lines of code. This is the scale of the project. The result shows that it works pretty well, ..., well, except I need to wait roughly 10s after opening/editing any file. And this falls in our expectations. We know it is hard to make it perfect in the first move. # What this patch does in detail - Introduced an option `--experimental-modules-support` for the support for C++20 Modules. So that no matter how bad this is, it wouldn't affect current users. 
From here on, assume the option is enabled.
- Introduced two classes, `ModuleFilesInfo` and `ModuleDependencyScanner`. Currently `ModuleDependencyScanner` is only used by `ModuleFilesInfo`.
- The class `ModuleFilesInfo` records the built module files for a single source file. The module files can only be built by the static member function `ModuleFilesInfo::buildModuleFilesInfoFor(PathRef File, ...)`.
- The class `PreambleData` gains a new member variable of type `ModuleFilesInfo`. This refers to the module files needed by the current file, which means the module files information is part of the preamble, as suggested in the first patch too.
- In `isPreambleCompatible()`, we add a call to `ModuleFilesInfo::CanReuse()` to check whether the built module files are still up to date.
- When we build the AST for a source file, we load the built module files recorded in `ModuleFilesInfo`.

# What we need to do next

Let's split the TODOs into a clang part and a clangd part to make things clearer.

The TODOs in the clangd part include:
1. Enable reusing module files across source files. This may require a ModulesManager-like component that handles `scheduling`, `the possibility of BMI version conflicts` and `various events that can invalidate the module graph`.
2. Find a more efficient way to get the module-name to module-unit-source map. Currently we always scan the whole project during `ModuleFilesInfo::buildModuleFilesInfoFor(PathRef File, ...)`. This is clearly inefficient even if the scanning process is pretty fast. Potential solutions include:
   - Make a global scanner that monitors the state of every source file, as in the first patch. The pain point there is having to take care of data races.
   - Ask the build systems to provide the map, just like we ask them to provide the compilation database.
3. Persist the module files, so that we can reuse them across clangd invocations or even across clangd instances.

TODOs in the clang part include:
1. Clang should offer an option/mode to skip writing/reading the bodies of functions, or even allow the parser to skip parsing function bodies.

It looks like the support for C++20 Modules can be called initially workable once we have (1) and (2) (or even without (2)).
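To make the intended workflow concrete, below is a minimal sketch of the kind of project this support targets. The file names, module name, and build setup are illustrative assumptions, not part of the patch; the only real knob involved is the `--experimental-modules-support` flag introduced above.

```cpp
// a.cppm -- primary module interface unit (illustrative file/module name).
export module a;

export int value() { return 42; }

// use.cpp -- a translation unit that imports the module. With
// --experimental-modules-support enabled, opening this file makes clangd
// scan the project for the module graph, build a BMI for module 'a' on the
// preamble thread, and then parse this file against that BMI.
import a;

int main() { return value(); }
```

Both files would need entries in the project's compile_commands.json (compiled with -std=c++20 or later), since the scanning-based ProjectModules implementation discovers the interface unit for module `a` through the global compilation database.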
--- clang-tools-extra/clangd/CMakeLists.txt | 3 + clang-tools-extra/clangd/ClangdLSPServer.cpp | 8 + clang-tools-extra/clangd/ClangdLSPServer.h | 5 + clang-tools-extra/clangd/ClangdServer.cpp | 2 + clang-tools-extra/clangd/ClangdServer.h | 6 + clang-tools-extra/clangd/Compiler.h | 3 + .../clangd/GlobalCompilationDatabase.cpp | 23 + .../clangd/GlobalCompilationDatabase.h | 13 + clang-tools-extra/clangd/ModulesBuilder.cpp | 336 +++++++++++++++ clang-tools-extra/clangd/ModulesBuilder.h | 106 +++++ clang-tools-extra/clangd/ParsedAST.cpp | 7 + clang-tools-extra/clangd/Preamble.cpp | 18 +- clang-tools-extra/clangd/Preamble.h | 4 + clang-tools-extra/clangd/ProjectModules.h | 50 +++ .../clangd/ScanningProjectModules.cpp | 202 +++++++++ .../clangd/ScanningProjectModules.h | 26 ++ clang-tools-extra/clangd/test/CMakeLists.txt | 1 + clang-tools-extra/clangd/test/modules.test | 83 ++++ clang-tools-extra/clangd/tool/Check.cpp | 12 +- clang-tools-extra/clangd/tool/ClangdMain.cpp | 8 + .../clangd/unittests/CMakeLists.txt | 1 + .../unittests/PrerequisiteModulesTest.cpp | 408 ++++++++++++++++++ clang-tools-extra/clangd/unittests/TestFS.h | 2 +- clang-tools-extra/docs/ReleaseNotes.rst | 4 + 24 files changed, 1327 insertions(+), 4 deletions(-) create mode 100644 clang-tools-extra/clangd/ModulesBuilder.cpp create mode 100644 clang-tools-extra/clangd/ModulesBuilder.h create mode 100644 clang-tools-extra/clangd/ProjectModules.h create mode 100644 clang-tools-extra/clangd/ScanningProjectModules.cpp create mode 100644 clang-tools-extra/clangd/ScanningProjectModules.h create mode 100644 clang-tools-extra/clangd/test/modules.test create mode 100644 clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index f49704157880d..c21d277d2ffcb 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -97,12 +97,14 @@ add_clang_library(clangDaemon IncludeFixer.cpp InlayHints.cpp JSONTransport.cpp + ModulesBuilder.cpp PathMapping.cpp Protocol.cpp Quality.cpp ParsedAST.cpp Preamble.cpp RIFF.cpp + ScanningProjectModules.cpp Selection.cpp SemanticHighlighting.cpp SemanticSelection.cpp @@ -161,6 +163,7 @@ clang_target_link_libraries(clangDaemon clangAST clangASTMatchers clangBasic + clangDependencyScanning clangDriver clangFormat clangFrontend diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 7fd599d4e1a0b..06573a5755424 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -14,6 +14,7 @@ #include "Feature.h" #include "GlobalCompilationDatabase.h" #include "LSPBinder.h" +#include "ModulesBuilder.h" #include "Protocol.h" #include "SemanticHighlighting.h" #include "SourceCode.h" @@ -51,6 +52,7 @@ namespace clang { namespace clangd { + namespace { // Tracks end-to-end latency of high level lsp calls. Measurements are in // seconds. @@ -563,6 +565,12 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, Mangler.ResourceDir = *Opts.ResourceDir; CDB.emplace(BaseCDB.get(), Params.initializationOptions.fallbackFlags, std::move(Mangler)); + + if (Opts.EnableExperimentalModulesSupport) { + ModulesManager.emplace(*CDB); + Opts.ModulesManager = &*ModulesManager; + } + { // Switch caller's context with LSPServer's background context. 
Since we // rather want to propagate information from LSPServer's context into the diff --git a/clang-tools-extra/clangd/ClangdLSPServer.h b/clang-tools-extra/clangd/ClangdLSPServer.h index 8bcb29522509b..0b8e4720f5323 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.h +++ b/clang-tools-extra/clangd/ClangdLSPServer.h @@ -63,6 +63,9 @@ class ClangdLSPServer : private ClangdServer::Callbacks, /// Limit the number of references returned (0 means no limit). size_t ReferencesLimit = 0; + + /// Flag to hint the experimental modules support is enabled. + bool EnableExperimentalModulesSupport = false; }; ClangdLSPServer(Transport &Transp, const ThreadsafeFS &TFS, @@ -323,6 +326,8 @@ class ClangdLSPServer : private ClangdServer::Callbacks, std::optional CDB; // The ClangdServer is created by the "initialize" LSP method. std::optional Server; + // Manages to build module files. + std::optional ModulesManager; }; } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index 1c4c2a79b5c05..e910a80ba0bae 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -216,6 +216,7 @@ ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB, Callbacks *Callbacks) : FeatureModules(Opts.FeatureModules), CDB(CDB), TFS(TFS), DynamicIdx(Opts.BuildDynamicSymbolIndex ? new FileIndex() : nullptr), + ModulesManager(Opts.ModulesManager), ClangTidyProvider(Opts.ClangTidyProvider), UseDirtyHeaders(Opts.UseDirtyHeaders), LineFoldingOnly(Opts.LineFoldingOnly), @@ -308,6 +309,7 @@ void ClangdServer::addDocument(PathRef File, llvm::StringRef Contents, Inputs.Index = Index; Inputs.ClangTidyProvider = ClangTidyProvider; Inputs.FeatureModules = FeatureModules; + Inputs.ModulesManager = ModulesManager; bool NewFile = WorkScheduler->update(File, Inputs, WantDiags); // If we loaded Foo.h, we want to make sure Foo.cpp is indexed. if (NewFile && BackgroundIdx) diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index 1661028be88b4..a653cdb56b751 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -16,6 +16,7 @@ #include "FeatureModule.h" #include "GlobalCompilationDatabase.h" #include "Hover.h" +#include "ModulesBuilder.h" #include "Protocol.h" #include "SemanticHighlighting.h" #include "TUScheduler.h" @@ -112,6 +113,9 @@ class ClangdServer { /// This throttler controls which preambles may be built at a given time. clangd::PreambleThrottler *PreambleThrottler = nullptr; + /// Manages to build module files. + ModulesBuilder *ModulesManager = nullptr; + /// If true, ClangdServer builds a dynamic in-memory index for symbols in /// opened files and uses the index to augment code completion results. bool BuildDynamicSymbolIndex = false; @@ -477,6 +481,8 @@ class ClangdServer { std::unique_ptr BackgroundIdx; // Storage for merged views of the various indexes. std::vector> MergedIdx; + // Manage module files. + ModulesBuilder *ModulesManager = nullptr; // When set, provides clang-tidy options for a specific file. 
TidyProviderRef ClangTidyProvider; diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h index 56c2567ebe97b..4e68da7610ca2 100644 --- a/clang-tools-extra/clangd/Compiler.h +++ b/clang-tools-extra/clangd/Compiler.h @@ -16,6 +16,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_COMPILER_H #include "FeatureModule.h" +#include "ModulesBuilder.h" #include "TidyProvider.h" #include "index/Index.h" #include "support/ThreadsafeFS.h" @@ -60,6 +61,8 @@ struct ParseInputs { TidyProviderRef ClangTidyProvider = {}; // Used to acquire ASTListeners when parsing files. FeatureModuleSet *FeatureModules = nullptr; + // Used to build and manage (C++) modules. + ModulesBuilder *ModulesManager = nullptr; }; /// Clears \p CI from options that are not supported by clangd, like codegen or diff --git a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp index 85c80eb482efb..1d96667a8e9f4 100644 --- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp +++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp @@ -9,6 +9,8 @@ #include "GlobalCompilationDatabase.h" #include "Config.h" #include "FS.h" +#include "ProjectModules.h" +#include "ScanningProjectModules.h" #include "SourceCode.h" #include "support/Logger.h" #include "support/Path.h" @@ -741,6 +743,20 @@ DirectoryBasedGlobalCompilationDatabase::getProjectInfo(PathRef File) const { return Res->PI; } +std::unique_ptr +DirectoryBasedGlobalCompilationDatabase::getProjectModules(PathRef File) const { + CDBLookupRequest Req; + Req.FileName = File; + Req.ShouldBroadcast = false; + Req.FreshTime = Req.FreshTimeMissing = + std::chrono::steady_clock::time_point::min(); + auto Res = lookupCDB(Req); + if (!Res) + return {}; + + return scanningProjectModules(Res->CDB, Opts.TFS); +} + OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base, std::vector FallbackFlags, CommandMangler Mangler) @@ -833,6 +849,13 @@ std::optional DelegatingCDB::getProjectInfo(PathRef File) const { return Base->getProjectInfo(File); } +std::unique_ptr +DelegatingCDB::getProjectModules(PathRef File) const { + if (!Base) + return nullptr; + return Base->getProjectModules(File); +} + tooling::CompileCommand DelegatingCDB::getFallbackCommand(PathRef File) const { if (!Base) return GlobalCompilationDatabase::getFallbackCommand(File); diff --git a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h index 2bf8c973c534c..ea999fe8aee01 100644 --- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h +++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h @@ -9,6 +9,7 @@ #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_GLOBALCOMPILATIONDATABASE_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_GLOBALCOMPILATIONDATABASE_H +#include "ProjectModules.h" #include "support/Function.h" #include "support/Path.h" #include "support/Threading.h" @@ -45,6 +46,12 @@ class GlobalCompilationDatabase { return std::nullopt; } + /// Get the modules in the closest project to \p File + virtual std::unique_ptr + getProjectModules(PathRef File) const { + return nullptr; + } + /// Makes a guess at how to build a file. /// The default implementation just runs clang on the file. /// Clangd should treat the results as unreliable. 
@@ -76,6 +83,9 @@ class DelegatingCDB : public GlobalCompilationDatabase { std::optional getProjectInfo(PathRef File) const override; + std::unique_ptr + getProjectModules(PathRef File) const override; + tooling::CompileCommand getFallbackCommand(PathRef File) const override; bool blockUntilIdle(Deadline D) const override; @@ -122,6 +132,9 @@ class DirectoryBasedGlobalCompilationDatabase /// \p File's parents. std::optional getProjectInfo(PathRef File) const override; + std::unique_ptr + getProjectModules(PathRef File) const override; + bool blockUntilIdle(Deadline Timeout) const override; private: diff --git a/clang-tools-extra/clangd/ModulesBuilder.cpp b/clang-tools-extra/clangd/ModulesBuilder.cpp new file mode 100644 index 0000000000000..94c7eec2d09e4 --- /dev/null +++ b/clang-tools-extra/clangd/ModulesBuilder.cpp @@ -0,0 +1,336 @@ +//===----------------- ModulesBuilder.cpp ------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ModulesBuilder.h" +#include "Compiler.h" +#include "support/Logger.h" +#include "clang/Frontend/FrontendAction.h" +#include "clang/Frontend/FrontendActions.h" +#include "clang/Serialization/ASTReader.h" + +namespace clang { +namespace clangd { + +namespace { + +// Create a path to store module files. Generally it should be: +// +// {TEMP_DIRS}/clangd/module_files/{hashed-file-name}-%%-%%-%%-%%-%%-%%/. +// +// {TEMP_DIRS} is the temporary directory for the system, e.g., "/var/tmp" +// or "C:/TEMP". +// +// '%%' means random value to make the generated path unique. +// +// \param MainFile is used to get the root of the project from global +// compilation database. +// +// TODO: Move these module fils out of the temporary directory if the module +// files are persistent. +llvm::SmallString<256> getUniqueModuleFilesPath(PathRef MainFile) { + llvm::SmallString<128> HashedPrefix = llvm::sys::path::filename(MainFile); + // There might be multiple files with the same name in a project. So appending + // the hash value of the full path to make sure they won't conflict. + HashedPrefix += std::to_string(llvm::hash_value(MainFile)); + + llvm::SmallString<256> ResultPattern; + + llvm::sys::path::system_temp_directory(/*erasedOnReboot=*/true, + ResultPattern); + + llvm::sys::path::append(ResultPattern, "clangd"); + llvm::sys::path::append(ResultPattern, "module_files"); + + llvm::sys::path::append(ResultPattern, HashedPrefix); + + ResultPattern.append("-%%-%%-%%-%%-%%-%%"); + + llvm::SmallString<256> Result; + llvm::sys::fs::createUniquePath(ResultPattern, Result, + /*MakeAbsolute=*/false); + + llvm::sys::fs::create_directories(Result); + return Result; +} + +// Get a unique module file path under \param ModuleFilesPrefix. 
+std::string getModuleFilePath(llvm::StringRef ModuleName, + PathRef ModuleFilesPrefix) { + llvm::SmallString<256> ModuleFilePath(ModuleFilesPrefix); + auto [PrimaryModuleName, PartitionName] = ModuleName.split(':'); + llvm::sys::path::append(ModuleFilePath, PrimaryModuleName); + if (!PartitionName.empty()) { + ModuleFilePath.append("-"); + ModuleFilePath.append(PartitionName); + } + + ModuleFilePath.append(".pcm"); + return std::string(ModuleFilePath); +} + +// FailedPrerequisiteModules - stands for the PrerequisiteModules which has +// errors happened during the building process. +class FailedPrerequisiteModules : public PrerequisiteModules { +public: + ~FailedPrerequisiteModules() override = default; + + // We shouldn't adjust the compilation commands based on + // FailedPrerequisiteModules. + void adjustHeaderSearchOptions(HeaderSearchOptions &Options) const override { + } + + // FailedPrerequisiteModules can never be reused. + bool + canReuse(const CompilerInvocation &CI, + llvm::IntrusiveRefCntPtr) const override { + return false; + } +}; + +// StandalonePrerequisiteModules - stands for PrerequisiteModules for which all +// the required modules are built successfully. All the module files +// are owned by the StandalonePrerequisiteModules class. +// +// Any of the built module files won't be shared with other instances of the +// class. So that we can avoid worrying thread safety. +// +// We don't need to worry about duplicated module names here since the standard +// guarantees the module names should be unique to a program. +class StandalonePrerequisiteModules : public PrerequisiteModules { +public: + StandalonePrerequisiteModules() = default; + + StandalonePrerequisiteModules(const StandalonePrerequisiteModules &) = delete; + StandalonePrerequisiteModules + operator=(const StandalonePrerequisiteModules &) = delete; + StandalonePrerequisiteModules(StandalonePrerequisiteModules &&) = delete; + StandalonePrerequisiteModules + operator=(StandalonePrerequisiteModules &&) = delete; + + ~StandalonePrerequisiteModules() override = default; + + void adjustHeaderSearchOptions(HeaderSearchOptions &Options) const override { + // Appending all built module files. + for (auto &RequiredModule : RequiredModules) + Options.PrebuiltModuleFiles.insert_or_assign( + RequiredModule.ModuleName, RequiredModule.ModuleFilePath); + } + + bool canReuse(const CompilerInvocation &CI, + llvm::IntrusiveRefCntPtr) const override; + + bool isModuleUnitBuilt(llvm::StringRef ModuleName) const { + return BuiltModuleNames.contains(ModuleName); + } + + void addModuleFile(llvm::StringRef ModuleName, + llvm::StringRef ModuleFilePath) { + RequiredModules.emplace_back(ModuleName, ModuleFilePath); + BuiltModuleNames.insert(ModuleName); + } + +private: + struct ModuleFile { + ModuleFile(llvm::StringRef ModuleName, PathRef ModuleFilePath) + : ModuleName(ModuleName.str()), ModuleFilePath(ModuleFilePath.str()) {} + + ModuleFile(const ModuleFile &) = delete; + ModuleFile operator=(const ModuleFile &) = delete; + + // The move constructor is needed for llvm::SmallVector. + ModuleFile(ModuleFile &&Other) + : ModuleName(std::move(Other.ModuleName)), + ModuleFilePath(std::move(Other.ModuleFilePath)) {} + + ModuleFile &operator=(ModuleFile &&Other) = delete; + + ~ModuleFile() { + if (!ModuleFilePath.empty()) + llvm::sys::fs::remove(ModuleFilePath); + } + + std::string ModuleName; + std::string ModuleFilePath; + }; + + llvm::SmallVector RequiredModules; + // A helper class to speedup the query if a module is built. 
+ llvm::StringSet<> BuiltModuleNames; +}; + +// Build a module file for module with `ModuleName`. The information of built +// module file are stored in \param BuiltModuleFiles. +llvm::Error buildModuleFile(llvm::StringRef ModuleName, + const GlobalCompilationDatabase &CDB, + const ThreadsafeFS &TFS, ProjectModules &MDB, + PathRef ModuleFilesPrefix, + StandalonePrerequisiteModules &BuiltModuleFiles) { + if (BuiltModuleFiles.isModuleUnitBuilt(ModuleName)) + return llvm::Error::success(); + + PathRef ModuleUnitFileName = MDB.getSourceForModuleName(ModuleName); + // It is possible that we're meeting third party modules (modules whose + // source are not in the project. e.g, the std module may be a third-party + // module for most projects) or something wrong with the implementation of + // ProjectModules. + // FIXME: How should we treat third party modules here? If we want to ignore + // third party modules, we should return true instead of false here. + // Currently we simply bail out. + if (ModuleUnitFileName.empty()) + return llvm::createStringError("Failed to get the primary source"); + + // Try cheap operation earlier to boil-out cheaply if there are problems. + auto Cmd = CDB.getCompileCommand(ModuleUnitFileName); + if (!Cmd) + return llvm::createStringError( + llvm::formatv("No compile command for {0}", ModuleUnitFileName)); + + for (auto &RequiredModuleName : MDB.getRequiredModules(ModuleUnitFileName)) { + // Return early if there are errors building the module file. + if (llvm::Error Err = buildModuleFile(RequiredModuleName, CDB, TFS, MDB, + ModuleFilesPrefix, BuiltModuleFiles)) + return llvm::createStringError( + llvm::formatv("Failed to build dependency {0}: {1}", + RequiredModuleName, llvm::toString(std::move(Err)))); + } + + Cmd->Output = getModuleFilePath(ModuleName, ModuleFilesPrefix); + + ParseInputs Inputs; + Inputs.TFS = &TFS; + Inputs.CompileCommand = std::move(*Cmd); + + IgnoreDiagnostics IgnoreDiags; + auto CI = buildCompilerInvocation(Inputs, IgnoreDiags); + if (!CI) + return llvm::createStringError("Failed to build compiler invocation"); + + auto FS = Inputs.TFS->view(Inputs.CompileCommand.Directory); + auto Buf = FS->getBufferForFile(Inputs.CompileCommand.Filename); + if (!Buf) + return llvm::createStringError("Failed to create buffer"); + + // In clang's driver, we will suppress the check for ODR violation in GMF. + // See the implementation of RenderModulesOptions in Clang.cpp. + CI->getLangOpts().SkipODRCheckInGMF = true; + + // Hash the contents of input files and store the hash value to the BMI files. + // So that we can check if the files are still valid when we want to reuse the + // BMI files. 
+ CI->getHeaderSearchOpts().ValidateASTInputFilesContent = true; + + BuiltModuleFiles.adjustHeaderSearchOptions(CI->getHeaderSearchOpts()); + + CI->getFrontendOpts().OutputFile = Inputs.CompileCommand.Output; + auto Clang = + prepareCompilerInstance(std::move(CI), /*Preamble=*/nullptr, + std::move(*Buf), std::move(FS), IgnoreDiags); + if (!Clang) + return llvm::createStringError("Failed to prepare compiler instance"); + + GenerateReducedModuleInterfaceAction Action; + Clang->ExecuteAction(Action); + + if (Clang->getDiagnostics().hasErrorOccurred()) + return llvm::createStringError("Compilation failed"); + + BuiltModuleFiles.addModuleFile(ModuleName, Inputs.CompileCommand.Output); + return llvm::Error::success(); +} +} // namespace + +std::unique_ptr +ModulesBuilder::buildPrerequisiteModulesFor(PathRef File, + const ThreadsafeFS &TFS) const { + std::unique_ptr MDB = CDB.getProjectModules(File); + if (!MDB) { + elog("Failed to get Project Modules information for {0}", File); + return std::make_unique(); + } + + std::vector RequiredModuleNames = MDB->getRequiredModules(File); + if (RequiredModuleNames.empty()) + return std::make_unique(); + + llvm::SmallString<256> ModuleFilesPrefix = getUniqueModuleFilesPath(File); + + log("Trying to build required modules for {0} in {1}", File, + ModuleFilesPrefix); + + auto RequiredModules = std::make_unique(); + + for (llvm::StringRef RequiredModuleName : RequiredModuleNames) { + // Return early if there is any error. + if (llvm::Error Err = + buildModuleFile(RequiredModuleName, CDB, TFS, *MDB.get(), + ModuleFilesPrefix, *RequiredModules.get())) { + elog("Failed to build module {0}; due to {1}", RequiredModuleName, + toString(std::move(Err))); + return std::make_unique(); + } + } + + log("Built required modules for {0} in {1}", File, ModuleFilesPrefix); + + return std::move(RequiredModules); +} + +bool StandalonePrerequisiteModules::canReuse( + const CompilerInvocation &CI, + llvm::IntrusiveRefCntPtr VFS) const { + if (RequiredModules.empty()) + return true; + + CompilerInstance Clang; + + Clang.setInvocation(std::make_shared(CI)); + IntrusiveRefCntPtr Diags = + CompilerInstance::createDiagnostics(new DiagnosticOptions()); + Clang.setDiagnostics(Diags.get()); + + FileManager *FM = Clang.createFileManager(VFS); + Clang.createSourceManager(*FM); + + if (!Clang.createTarget()) + return false; + + assert(Clang.getHeaderSearchOptsPtr()); + adjustHeaderSearchOptions(Clang.getHeaderSearchOpts()); + // Since we don't need to compile the source code actually, the TU kind here + // doesn't matter. + Clang.createPreprocessor(TU_Complete); + Clang.getHeaderSearchOpts().ForceCheckCXX20ModulesInputFiles = true; + Clang.getHeaderSearchOpts().ValidateASTInputFilesContent = true; + + // Following the practice of clang's driver to suppres the checking for ODR + // violation in GMF. + // See + // https://clang.llvm.org/docs/StandardCPlusPlusModules.html#object-definition-consistency + // for example. + Clang.getLangOpts().SkipODRCheckInGMF = true; + + Clang.createASTReader(); + for (auto &RequiredModule : RequiredModules) { + llvm::StringRef BMIPath = RequiredModule.ModuleFilePath; + // FIXME: Loading BMI fully is too heavy considering something cheaply to + // check if we can reuse the BMI. 
+ auto ReadResult = + Clang.getASTReader()->ReadAST(BMIPath, serialization::MK_MainFile, + SourceLocation(), ASTReader::ARR_None); + + if (ReadResult != ASTReader::Success) { + elog("Can't reuse {0}: {1}", BMIPath, ReadResult); + return false; + } + } + + return true; +} + +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/ModulesBuilder.h b/clang-tools-extra/clangd/ModulesBuilder.h new file mode 100644 index 0000000000000..0514e7486475d --- /dev/null +++ b/clang-tools-extra/clangd/ModulesBuilder.h @@ -0,0 +1,106 @@ +//===----------------- ModulesBuilder.h --------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Experimental support for C++20 Modules. +// +// Currently we simplify the implementations by preventing reusing module files +// across different versions and different source files. But this is clearly a +// waste of time and space in the end of the day. +// +// TODO: Supporting reusing module files across different versions and +// different source files. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_MODULES_BUILDER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_MODULES_BUILDER_H + +#include "GlobalCompilationDatabase.h" +#include "ProjectModules.h" +#include "support/Path.h" +#include "support/ThreadsafeFS.h" +#include "clang/Frontend/CompilerInvocation.h" +#include "llvm/ADT/SmallString.h" +#include + +namespace clang { +namespace clangd { + +/// Store all the needed module files information to parse a single +/// source file. e.g., +/// +/// ``` +/// // a.cppm +/// export module a; +/// +/// // b.cppm +/// export module b; +/// import a; +/// +/// // c.cppm +/// export module c; +/// import b; +/// ``` +/// +/// For the source file `c.cppm`, an instance of the class will store +/// the module files for `a.cppm` and `b.cppm`. But the module file for `c.cppm` +/// won't be stored. Since it is not needed to parse `c.cppm`. +/// +/// Users should only get PrerequisiteModules from +/// `ModulesBuilder::buildPrerequisiteModulesFor(...)`. +/// +/// Users can detect whether the PrerequisiteModules is still up to date by +/// calling the `canReuse()` member function. +/// +/// The users should call `adjustHeaderSearchOptions(...)` to update the +/// compilation commands to select the built module files first. Before calling +/// `adjustHeaderSearchOptions()`, users should call `canReuse()` first to check +/// if all the stored module files are valid. In case they are not valid, +/// users should call `ModulesBuilder::buildPrerequisiteModulesFor(...)` again +/// to get the new PrerequisiteModules. +class PrerequisiteModules { +public: + /// Change commands to load the module files recorded in this + /// PrerequisiteModules first. + virtual void + adjustHeaderSearchOptions(HeaderSearchOptions &Options) const = 0; + + /// Whether or not the built module files are up to date. + /// Note that this can only be used after building the module files. + virtual bool + canReuse(const CompilerInvocation &CI, + llvm::IntrusiveRefCntPtr) const = 0; + + virtual ~PrerequisiteModules() = default; +}; + +/// This class handles building module files for a given source file. 
+/// +/// In the future, we want the class to manage the module files acorss +/// different versions and different source files. +class ModulesBuilder { +public: + ModulesBuilder(const GlobalCompilationDatabase &CDB) : CDB(CDB) {} + + ModulesBuilder(const ModulesBuilder &) = delete; + ModulesBuilder(ModulesBuilder &&) = delete; + + ModulesBuilder &operator=(const ModulesBuilder &) = delete; + ModulesBuilder &operator=(ModulesBuilder &&) = delete; + + std::unique_ptr + buildPrerequisiteModulesFor(PathRef File, const ThreadsafeFS &TFS) const; + +private: + const GlobalCompilationDatabase &CDB; +}; + +} // namespace clangd +} // namespace clang + +#endif diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 2bd1fbcad2ada..a2f1504db7e88 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -446,6 +446,12 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, L->sawDiagnostic(D, Diag); }); + // Adjust header search options to load the built module files recorded + // in RequiredModules. + if (Preamble && Preamble->RequiredModules) + Preamble->RequiredModules->adjustHeaderSearchOptions( + CI->getHeaderSearchOpts()); + std::optional Patch; // We might use an ignoring diagnostic consumer if they are going to be // dropped later on to not pay for extra latency by processing them. @@ -459,6 +465,7 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, std::move(CI), PreamblePCH, llvm::MemoryBuffer::getMemBufferCopy(Inputs.Contents, Filename), VFS, *DiagConsumer); + if (!Clang) { // The last diagnostic contains information about the reason of this // failure. diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index ecd490145dd3c..dd13b1a9e5613 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -664,6 +664,7 @@ buildPreamble(PathRef FileName, CompilerInvocation CI, CI, ContentsBuffer.get(), Bounds, *PreambleDiagsEngine, Stats ? 
TimedFS : StatCacheFS, std::make_shared(), StoreInMemory, /*StoragePath=*/"", CapturedInfo); + PreambleTimer.stopTimer(); // We have to setup DiagnosticConsumer that will be alife @@ -696,6 +697,19 @@ buildPreamble(PathRef FileName, CompilerInvocation CI, Result->Includes = CapturedInfo.takeIncludes(); Result->Pragmas = std::make_shared( CapturedInfo.takePragmaIncludes()); + + if (Inputs.ModulesManager) { + WallTimer PrerequisiteModuleTimer; + PrerequisiteModuleTimer.startTimer(); + Result->RequiredModules = + Inputs.ModulesManager->buildPrerequisiteModulesFor(FileName, + *Inputs.TFS); + PrerequisiteModuleTimer.stopTimer(); + + log("Built prerequisite modules for file {0} in {1} seconds", FileName, + PrerequisiteModuleTimer.getTime()); + } + Result->Macros = CapturedInfo.takeMacros(); Result->Marks = CapturedInfo.takeMarks(); Result->StatCache = StatCache; @@ -737,7 +751,9 @@ bool isPreambleCompatible(const PreambleData &Preamble, auto VFS = Inputs.TFS->view(Inputs.CompileCommand.Directory); return compileCommandsAreEqual(Inputs.CompileCommand, Preamble.CompileCommand) && - Preamble.Preamble.CanReuse(CI, *ContentsBuffer, Bounds, *VFS); + Preamble.Preamble.CanReuse(CI, *ContentsBuffer, Bounds, *VFS) && + (!Preamble.RequiredModules || + Preamble.RequiredModules->canReuse(CI, VFS)); } void escapeBackslashAndQuotes(llvm::StringRef Text, llvm::raw_ostream &OS) { diff --git a/clang-tools-extra/clangd/Preamble.h b/clang-tools-extra/clangd/Preamble.h index 160b884beb56b..be8fed4ab88cd 100644 --- a/clang-tools-extra/clangd/Preamble.h +++ b/clang-tools-extra/clangd/Preamble.h @@ -27,6 +27,8 @@ #include "Diagnostics.h" #include "FS.h" #include "Headers.h" +#include "ModulesBuilder.h" + #include "clang-include-cleaner/Record.h" #include "support/Path.h" #include "clang/Basic/SourceManager.h" @@ -109,6 +111,8 @@ struct PreambleData { IncludeStructure Includes; // Captures #include-mapping information in #included headers. std::shared_ptr Pragmas; + // Information about required module files for this preamble. + std::unique_ptr RequiredModules; // Macros defined in the preamble section of the main file. // Users care about headers vs main-file, not preamble vs non-preamble. // These should be treated as main-file entities e.g. for code completion. diff --git a/clang-tools-extra/clangd/ProjectModules.h b/clang-tools-extra/clangd/ProjectModules.h new file mode 100644 index 0000000000000..3b9b564a87da0 --- /dev/null +++ b/clang-tools-extra/clangd/ProjectModules.h @@ -0,0 +1,50 @@ +//===------------------ ProjectModules.h -------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_PROJECTMODULES_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_PROJECTMODULES_H + +#include "support/Path.h" +#include "support/ThreadsafeFS.h" + +#include + +namespace clang { +namespace clangd { + +/// An interface to query the modules information in the project. +/// Users should get instances of `ProjectModules` from +/// `GlobalCompilationDatabase::getProjectModules(PathRef)`. +/// +/// Currently, the modules information includes: +/// - Given a source file, what are the required modules. +/// - Given a module name and a required source file, what is +/// the corresponding source file. 
+/// +/// Note that there can be multiple source files declaring the same module +/// in a valid project. Although the language specification requires that +/// every module unit's name must be unique in valid program, there can be +/// multiple program in a project. And it is technically valid if these program +/// doesn't interfere with each other. +/// +/// A module name should be in the format: +/// `[:partition-name]`. So module names covers partitions. +class ProjectModules { +public: + virtual std::vector getRequiredModules(PathRef File) = 0; + virtual PathRef + getSourceForModuleName(llvm::StringRef ModuleName, + PathRef RequiredSrcFile = PathRef()) = 0; + + virtual ~ProjectModules() = default; +}; + +} // namespace clangd +} // namespace clang + +#endif diff --git a/clang-tools-extra/clangd/ScanningProjectModules.cpp b/clang-tools-extra/clangd/ScanningProjectModules.cpp new file mode 100644 index 0000000000000..92f75ef7d5c25 --- /dev/null +++ b/clang-tools-extra/clangd/ScanningProjectModules.cpp @@ -0,0 +1,202 @@ +//===------------------ ProjectModules.h -------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProjectModules.h" +#include "support/Logger.h" +#include "clang/Tooling/DependencyScanning/DependencyScanningService.h" +#include "clang/Tooling/DependencyScanning/DependencyScanningTool.h" + +namespace clang::clangd { +namespace { +/// A scanner to query the dependency information for C++20 Modules. +/// +/// The scanner can scan a single file with `scan(PathRef)` member function +/// or scan the whole project with `globalScan(vector)` member +/// function. See the comments of `globalScan` to see the details. +/// +/// The ModuleDependencyScanner can get the directly required module names for a +/// specific source file. Also the ModuleDependencyScanner can get the source +/// file declaring the primary module interface for a specific module name. +/// +/// IMPORTANT NOTE: we assume that every module unit is only declared once in a +/// source file in the project. But the assumption is not strictly true even +/// besides the invalid projects. The language specification requires that every +/// module unit should be unique in a valid program. But a project can contain +/// multiple programs. Then it is valid that we can have multiple source files +/// declaring the same module in a project as long as these source files don't +/// interfere with each other. +class ModuleDependencyScanner { +public: + ModuleDependencyScanner( + std::shared_ptr CDB, + const ThreadsafeFS &TFS) + : CDB(CDB), TFS(TFS), + Service(tooling::dependencies::ScanningMode::CanonicalPreprocessing, + tooling::dependencies::ScanningOutputFormat::P1689) {} + + /// The scanned modules dependency information for a specific source file. + struct ModuleDependencyInfo { + /// The name of the module if the file is a module unit. + std::optional ModuleName; + /// A list of names for the modules that the file directly depends. + std::vector RequiredModules; + }; + + /// Scanning the single file specified by \param FilePath. + std::optional scan(PathRef FilePath); + + /// Scanning every source file in the current project to get the + /// to map. + /// TODO: We should find an efficient method to get the + /// to map. 
We can make it either by providing + /// a global module dependency scanner to monitor every file. Or we + /// can simply require the build systems (or even the end users) + /// to provide the map. + void globalScan(); + + /// Get the source file from the module name. Note that the language + /// guarantees all the module names are unique in a valid program. + /// This function should only be called after globalScan. + /// + /// TODO: We should handle the case that there are multiple source files + /// declaring the same module. + PathRef getSourceForModuleName(llvm::StringRef ModuleName) const; + + /// Return the direct required modules. Indirect required modules are not + /// included. + std::vector getRequiredModules(PathRef File); + +private: + std::shared_ptr CDB; + const ThreadsafeFS &TFS; + + // Whether the scanner has scanned the project globally. + bool GlobalScanned = false; + + clang::tooling::dependencies::DependencyScanningService Service; + + // TODO: Add a scanning cache. + + // Map module name to source file path. + llvm::StringMap ModuleNameToSource; +}; + +std::optional +ModuleDependencyScanner::scan(PathRef FilePath) { + auto Candidates = CDB->getCompileCommands(FilePath); + if (Candidates.empty()) + return std::nullopt; + + // Choose the first candidates as the compile commands as the file. + // Following the same logic with + // DirectoryBasedGlobalCompilationDatabase::getCompileCommand. + tooling::CompileCommand Cmd = std::move(Candidates.front()); + + static int StaticForMainAddr; // Just an address in this process. + Cmd.CommandLine.push_back("-resource-dir=" + + CompilerInvocation::GetResourcesPath( + "clangd", (void *)&StaticForMainAddr)); + + using namespace clang::tooling::dependencies; + + llvm::SmallString<128> FilePathDir(FilePath); + llvm::sys::path::remove_filename(FilePathDir); + DependencyScanningTool ScanningTool(Service, TFS.view(FilePathDir)); + + llvm::Expected ScanningResult = + ScanningTool.getP1689ModuleDependencyFile(Cmd, Cmd.Directory); + + if (auto E = ScanningResult.takeError()) { + elog("Scanning modules dependencies for {0} failed: {1}", FilePath, + llvm::toString(std::move(E))); + return std::nullopt; + } + + ModuleDependencyInfo Result; + + if (ScanningResult->Provides) { + ModuleNameToSource[ScanningResult->Provides->ModuleName] = FilePath; + Result.ModuleName = ScanningResult->Provides->ModuleName; + } + + for (auto &Required : ScanningResult->Requires) + Result.RequiredModules.push_back(Required.ModuleName); + + return Result; +} + +void ModuleDependencyScanner::globalScan() { + for (auto &File : CDB->getAllFiles()) + scan(File); + + GlobalScanned = true; +} + +PathRef ModuleDependencyScanner::getSourceForModuleName( + llvm::StringRef ModuleName) const { + assert( + GlobalScanned && + "We should only call getSourceForModuleName after calling globalScan()"); + + if (auto It = ModuleNameToSource.find(ModuleName); + It != ModuleNameToSource.end()) + return It->second; + + return {}; +} + +std::vector +ModuleDependencyScanner::getRequiredModules(PathRef File) { + auto ScanningResult = scan(File); + if (!ScanningResult) + return {}; + + return ScanningResult->RequiredModules; +} +} // namespace + +/// TODO: The existing `ScanningAllProjectModules` is not efficient. See the +/// comments in ModuleDependencyScanner for detail. +/// +/// In the future, we wish the build system can provide a well design +/// compilation database for modules then we can query that new compilation +/// database directly. 
Or we need to have a global long-live scanner to detect +/// the state of each file. +class ScanningAllProjectModules : public ProjectModules { +public: + ScanningAllProjectModules( + std::shared_ptr CDB, + const ThreadsafeFS &TFS) + : Scanner(CDB, TFS) {} + + ~ScanningAllProjectModules() override = default; + + std::vector getRequiredModules(PathRef File) override { + return Scanner.getRequiredModules(File); + } + + /// RequiredSourceFile is not used intentionally. See the comments of + /// ModuleDependencyScanner for detail. + PathRef + getSourceForModuleName(llvm::StringRef ModuleName, + PathRef RequiredSourceFile = PathRef()) override { + Scanner.globalScan(); + return Scanner.getSourceForModuleName(ModuleName); + } + +private: + ModuleDependencyScanner Scanner; +}; + +std::unique_ptr scanningProjectModules( + std::shared_ptr CDB, + const ThreadsafeFS &TFS) { + return std::make_unique(CDB, TFS); +} + +} // namespace clang::clangd diff --git a/clang-tools-extra/clangd/ScanningProjectModules.h b/clang-tools-extra/clangd/ScanningProjectModules.h new file mode 100644 index 0000000000000..75fc7dbcebce5 --- /dev/null +++ b/clang-tools-extra/clangd/ScanningProjectModules.h @@ -0,0 +1,26 @@ +//===------------ ScanningProjectModules.h -----------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SCANNINGPROJECTMODULES_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SCANNINGPROJECTMODULES_H + +#include "ProjectModules.h" +#include "clang/Tooling/CompilationDatabase.h" + +namespace clang { +namespace clangd { + +/// Providing modules information for the project by scanning every file. +std::unique_ptr scanningProjectModules( + std::shared_ptr CDB, + const ThreadsafeFS &TFS); + +} // namespace clangd +} // namespace clang + +#endif diff --git a/clang-tools-extra/clangd/test/CMakeLists.txt b/clang-tools-extra/clangd/test/CMakeLists.txt index d073267066e0b..b51f461a49866 100644 --- a/clang-tools-extra/clangd/test/CMakeLists.txt +++ b/clang-tools-extra/clangd/test/CMakeLists.txt @@ -2,6 +2,7 @@ set(CLANGD_TEST_DEPS clangd ClangdTests clangd-indexer + split-file # No tests for it, but we should still make sure they build. dexp ) diff --git a/clang-tools-extra/clangd/test/modules.test b/clang-tools-extra/clangd/test/modules.test new file mode 100644 index 0000000000000..74280811a6cff --- /dev/null +++ b/clang-tools-extra/clangd/test/modules.test @@ -0,0 +1,83 @@ +# A smoke test to check the modules can work basically. +# +# Windows have different escaping modes. +# FIXME: We should add one for windows. 
+# UNSUPPORTED: system-windows +# +# RUN: rm -fr %t +# RUN: mkdir -p %t +# RUN: split-file %s %t +# +# RUN: sed -e "s|DIR|%/t|g" %t/compile_commands.json.tmpl > %t/compile_commands.json.tmp +# RUN: sed -e "s|CLANG_CC|%clang|g" %t/compile_commands.json.tmp > %t/compile_commands.json +# RUN: sed -e "s|DIR|%/t|g" %t/definition.jsonrpc.tmpl > %t/definition.jsonrpc +# +# RUN: clangd -experimental-modules-support -lit-test < %t/definition.jsonrpc \ +# RUN: | FileCheck -strict-whitespace %t/definition.jsonrpc + +#--- A.cppm +export module A; +export void printA() {} + +#--- Use.cpp +import A; +void foo() { + print +} + +#--- compile_commands.json.tmpl +[ + { + "directory": "DIR", + "command": "CLANG_CC -fprebuilt-module-path=DIR -std=c++20 -o DIR/main.cpp.o -c DIR/Use.cpp", + "file": "DIR/Use.cpp" + }, + { + "directory": "DIR", + "command": "CLANG_CC -std=c++20 DIR/A.cppm --precompile -o DIR/A.pcm", + "file": "DIR/A.cppm" + } +] + +#--- definition.jsonrpc.tmpl +{ + "jsonrpc": "2.0", + "id": 0, + "method": "initialize", + "params": { + "processId": 123, + "rootPath": "clangd", + "capabilities": { + "textDocument": { + "completion": { + "completionItem": { + "snippetSupport": true + } + } + } + }, + "trace": "off" + } +} +--- +{ + "jsonrpc": "2.0", + "method": "textDocument/didOpen", + "params": { + "textDocument": { + "uri": "file://DIR/Use.cpp", + "languageId": "cpp", + "version": 1, + "text": "import A;\nvoid foo() {\n print\n}\n" + } + } +} + +# CHECK: "message"{{.*}}printA{{.*}}(fix available) + +--- +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"file://DIR/Use.cpp"},"context":{"triggerKind":1},"position":{"line":2,"character":6}}} +--- +{"jsonrpc":"2.0","id":2,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp index 25005ec1bd045..bc2eaa77a66ee 100644 --- a/clang-tools-extra/clangd/tool/Check.cpp +++ b/clang-tools-extra/clangd/tool/Check.cpp @@ -146,10 +146,13 @@ class Checker { ClangdLSPServer::Options Opts; // from buildCommand tooling::CompileCommand Cmd; + std::unique_ptr BaseCDB; + std::unique_ptr CDB; // from buildInvocation ParseInputs Inputs; std::unique_ptr Invocation; format::FormatStyle Style; + std::optional ModulesManager; // from buildAST std::shared_ptr Preamble; std::optional AST; @@ -168,14 +171,14 @@ class Checker { DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS); CDBOpts.CompileCommandsDir = Config::current().CompileFlags.CDBSearch.FixedCDBPath; - std::unique_ptr BaseCDB = + BaseCDB = std::make_unique(CDBOpts); auto Mangler = CommandMangler::detect(); Mangler.SystemIncludeExtractor = getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs)); if (Opts.ResourceDir) Mangler.ResourceDir = *Opts.ResourceDir; - auto CDB = std::make_unique( + CDB = std::make_unique( BaseCDB.get(), std::vector{}, std::move(Mangler)); if (auto TrueCmd = CDB->getCompileCommand(File)) { @@ -213,6 +216,11 @@ class Checker { return false; } } + if (Opts.EnableExperimentalModulesSupport) { + if (!ModulesManager) + ModulesManager.emplace(*CDB); + Inputs.ModulesManager = &*ModulesManager; + } log("Parsing command..."); Invocation = buildCompilerInvocation(Inputs, CaptureInvocationDiags, &CC1Args); diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 73000d96c6ca8..3a5449ac8c799 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ 
b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -551,6 +551,13 @@ opt ProjectRoot{ }; #endif +opt ExperimentalModulesSupport{ + "experimental-modules-support", + cat(Features), + desc("Experimental support for standard c++ modules"), + init(false), +}; + /// Supports a test URI scheme with relaxed constraints for lit tests. /// The path in a test URI will be combined with a platform-specific fake /// directory to form an absolute path. For example, test:///a.cpp is resolved @@ -860,6 +867,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var ClangdLSPServer::Options Opts; Opts.UseDirBasedCDB = (CompileArgsFrom == FilesystemCompileArgs); + Opts.EnableExperimentalModulesSupport = ExperimentalModulesSupport; switch (PCHStorage) { case PCHStorageFlag::Memory: diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index 0d4628ccf25d8..4fa9f18407ae9 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -74,6 +74,7 @@ add_unittest(ClangdUnitTests ClangdTests LoggerTests.cpp LSPBinderTests.cpp LSPClient.cpp + PrerequisiteModulesTest.cpp ModulesTests.cpp ParsedASTTests.cpp PathMappingTests.cpp diff --git a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp new file mode 100644 index 0000000000000..7bbb95c8b8d67 --- /dev/null +++ b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp @@ -0,0 +1,408 @@ +//===--------------- PrerequisiteModulesTests.cpp -------------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// FIXME: Skip testing on windows temporarily due to the different escaping +/// code mode. 
+#ifndef _WIN32 + +#include "ModulesBuilder.h" +#include "ScanningProjectModules.h" +#include "Annotations.h" +#include "CodeComplete.h" +#include "Compiler.h" +#include "TestTU.h" +#include "support/ThreadsafeFS.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang::clangd { +namespace { + +class MockDirectoryCompilationDatabase : public MockCompilationDatabase { +public: + MockDirectoryCompilationDatabase(StringRef TestDir, const ThreadsafeFS &TFS) + : MockCompilationDatabase(TestDir), + MockedCDBPtr(std::make_shared(*this)), + TFS(TFS) { + this->ExtraClangFlags.push_back("-std=c++20"); + this->ExtraClangFlags.push_back("-c"); + } + + void addFile(llvm::StringRef Path, llvm::StringRef Contents); + + std::unique_ptr getProjectModules(PathRef) const override { + return scanningProjectModules(MockedCDBPtr, TFS); + } + +private: + class MockClangCompilationDatabase : public tooling::CompilationDatabase { + public: + MockClangCompilationDatabase(MockDirectoryCompilationDatabase &MCDB) + : MCDB(MCDB) {} + + std::vector + getCompileCommands(StringRef FilePath) const override { + std::optional Cmd = + MCDB.getCompileCommand(FilePath); + EXPECT_TRUE(Cmd); + return {*Cmd}; + } + + std::vector getAllFiles() const override { return Files; } + + void AddFile(StringRef File) { Files.push_back(File.str()); } + + private: + MockDirectoryCompilationDatabase &MCDB; + std::vector Files; + }; + + std::shared_ptr MockedCDBPtr; + const ThreadsafeFS &TFS; +}; + +// Add files to the working testing directory and the compilation database. +void MockDirectoryCompilationDatabase::addFile(llvm::StringRef Path, + llvm::StringRef Contents) { + ASSERT_FALSE(llvm::sys::path::is_absolute(Path)); + + SmallString<256> AbsPath(Directory); + llvm::sys::path::append(AbsPath, Path); + + ASSERT_FALSE( + llvm::sys::fs::create_directories(llvm::sys::path::parent_path(AbsPath))); + + std::error_code EC; + llvm::raw_fd_ostream OS(AbsPath, EC); + ASSERT_FALSE(EC); + OS << Contents; + + MockedCDBPtr->AddFile(Path); +} + +class PrerequisiteModulesTests : public ::testing::Test { +protected: + void SetUp() override { + ASSERT_FALSE(llvm::sys::fs::createUniqueDirectory("modules-test", TestDir)); + } + + void TearDown() override { + ASSERT_FALSE(llvm::sys::fs::remove_directories(TestDir)); + } + +public: + // Get the absolute path for file specified by Path under testing working + // directory. + std::string getFullPath(llvm::StringRef Path) { + SmallString<128> Result(TestDir); + llvm::sys::path::append(Result, Path); + EXPECT_TRUE(llvm::sys::fs::exists(Result.str())); + return Result.str().str(); + } + + ParseInputs getInputs(llvm::StringRef FileName, + const GlobalCompilationDatabase &CDB) { + std::string FullPathName = getFullPath(FileName); + + ParseInputs Inputs; + std::optional Cmd = + CDB.getCompileCommand(FullPathName); + EXPECT_TRUE(Cmd); + Inputs.CompileCommand = std::move(*Cmd); + Inputs.TFS = &FS; + + if (auto Contents = FS.view(TestDir)->getBufferForFile(FullPathName)) + Inputs.Contents = Contents->get()->getBuffer().str(); + + return Inputs; + } + + SmallString<256> TestDir; + // FIXME: It will be better to use the MockFS if the scanning process and + // build module process doesn't depend on reading real IO. 
+ RealThreadsafeFS FS; + + DiagnosticConsumer DiagConsumer; +}; + +TEST_F(PrerequisiteModulesTests, NonModularTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("foo.h", R"cpp( +inline void foo() {} + )cpp"); + + CDB.addFile("NonModular.cpp", R"cpp( +#include "foo.h" +void use() { + foo(); +} + )cpp"); + + ModulesBuilder Builder(CDB); + + // NonModular.cpp is not related to modules. So nothing should be built. + auto NonModularInfo = + Builder.buildPrerequisiteModulesFor(getFullPath("NonModular.cpp"), FS); + EXPECT_TRUE(NonModularInfo); + + HeaderSearchOptions HSOpts; + NonModularInfo->adjustHeaderSearchOptions(HSOpts); + EXPECT_TRUE(HSOpts.PrebuiltModuleFiles.empty()); + + auto Invocation = + buildCompilerInvocation(getInputs("NonModular.cpp", CDB), DiagConsumer); + EXPECT_TRUE(NonModularInfo->canReuse(*Invocation, FS.view(TestDir))); +} + +TEST_F(PrerequisiteModulesTests, ModuleWithoutDepTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("foo.h", R"cpp( +inline void foo() {} + )cpp"); + + CDB.addFile("M.cppm", R"cpp( +module; +#include "foo.h" +export module M; + )cpp"); + + ModulesBuilder Builder(CDB); + + auto MInfo = Builder.buildPrerequisiteModulesFor(getFullPath("M.cppm"), FS); + EXPECT_TRUE(MInfo); + + // Nothing should be built since M doesn't dependent on anything. + HeaderSearchOptions HSOpts; + MInfo->adjustHeaderSearchOptions(HSOpts); + EXPECT_TRUE(HSOpts.PrebuiltModuleFiles.empty()); + + auto Invocation = + buildCompilerInvocation(getInputs("M.cppm", CDB), DiagConsumer); + EXPECT_TRUE(MInfo->canReuse(*Invocation, FS.view(TestDir))); +} + +TEST_F(PrerequisiteModulesTests, ModuleWithDepTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("foo.h", R"cpp( +inline void foo() {} + )cpp"); + + CDB.addFile("M.cppm", R"cpp( +module; +#include "foo.h" +export module M; + )cpp"); + + CDB.addFile("N.cppm", R"cpp( +export module N; +import :Part; +import M; + )cpp"); + + CDB.addFile("N-part.cppm", R"cpp( +// Different module name with filename intentionally. +export module N:Part; + )cpp"); + + ModulesBuilder Builder(CDB); + + auto NInfo = Builder.buildPrerequisiteModulesFor(getFullPath("N.cppm"), FS); + EXPECT_TRUE(NInfo); + + ParseInputs NInput = getInputs("N.cppm", CDB); + std::unique_ptr Invocation = + buildCompilerInvocation(NInput, DiagConsumer); + // Test that `PrerequisiteModules::canReuse` works basically. + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + { + // Check that + // `PrerequisiteModules::adjustHeaderSearchOptions(HeaderSearchOptions&)` + // can appending HeaderSearchOptions correctly. + HeaderSearchOptions HSOpts; + NInfo->adjustHeaderSearchOptions(HSOpts); + + EXPECT_TRUE(HSOpts.PrebuiltModuleFiles.count("M")); + EXPECT_TRUE(HSOpts.PrebuiltModuleFiles.count("N:Part")); + } + + { + // Check that + // `PrerequisiteModules::adjustHeaderSearchOptions(HeaderSearchOptions&)` + // can replace HeaderSearchOptions correctly. 
+ HeaderSearchOptions HSOpts; + HSOpts.PrebuiltModuleFiles["M"] = "incorrect_path"; + HSOpts.PrebuiltModuleFiles["N:Part"] = "incorrect_path"; + NInfo->adjustHeaderSearchOptions(HSOpts); + + EXPECT_TRUE(StringRef(HSOpts.PrebuiltModuleFiles["M"]).ends_with(".pcm")); + EXPECT_TRUE( + StringRef(HSOpts.PrebuiltModuleFiles["N:Part"]).ends_with(".pcm")); + } +} + +TEST_F(PrerequisiteModulesTests, ReusabilityTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("foo.h", R"cpp( +inline void foo() {} + )cpp"); + + CDB.addFile("M.cppm", R"cpp( +module; +#include "foo.h" +export module M; + )cpp"); + + CDB.addFile("N.cppm", R"cpp( +export module N; +import :Part; +import M; + )cpp"); + + CDB.addFile("N-part.cppm", R"cpp( +// Different module name with filename intentionally. +export module N:Part; + )cpp"); + + ModulesBuilder Builder(CDB); + + auto NInfo = Builder.buildPrerequisiteModulesFor(getFullPath("N.cppm"), FS); + EXPECT_TRUE(NInfo); + EXPECT_TRUE(NInfo); + + ParseInputs NInput = getInputs("N.cppm", CDB); + std::unique_ptr Invocation = + buildCompilerInvocation(NInput, DiagConsumer); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + // Test that we can still reuse the NInfo after we touch a unrelated file. + { + CDB.addFile("L.cppm", R"cpp( +module; +#include "foo.h" +export module L; +export int ll = 43; + )cpp"); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + CDB.addFile("bar.h", R"cpp( +inline void bar() {} +inline void bar(int) {} + )cpp"); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + } + + // Test that we can't reuse the NInfo after we touch a related file. + { + CDB.addFile("M.cppm", R"cpp( +module; +#include "foo.h" +export module M; +export int mm = 44; + )cpp"); + EXPECT_FALSE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + NInfo = Builder.buildPrerequisiteModulesFor(getFullPath("N.cppm"), FS); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + CDB.addFile("foo.h", R"cpp( +inline void foo() {} +inline void foo(int) {} + )cpp"); + EXPECT_FALSE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + NInfo = Builder.buildPrerequisiteModulesFor(getFullPath("N.cppm"), FS); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + } + + CDB.addFile("N-part.cppm", R"cpp( +export module N:Part; +// Intentioned to make it uncompilable. +export int NPart = 4LIdjwldijaw + )cpp"); + EXPECT_FALSE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + NInfo = Builder.buildPrerequisiteModulesFor(getFullPath("N.cppm"), FS); + EXPECT_TRUE(NInfo); + EXPECT_FALSE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + CDB.addFile("N-part.cppm", R"cpp( +export module N:Part; +export int NPart = 43; + )cpp"); + EXPECT_TRUE(NInfo); + EXPECT_FALSE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + NInfo = Builder.buildPrerequisiteModulesFor(getFullPath("N.cppm"), FS); + EXPECT_TRUE(NInfo); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + // Test that if we changed the modification time of the file, the module files + // info is still reusable if its content doesn't change. + CDB.addFile("N-part.cppm", R"cpp( +export module N:Part; +export int NPart = 43; + )cpp"); + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); + + CDB.addFile("N.cppm", R"cpp( +export module N; +import :Part; +import M; + +export int nn = 43; + )cpp"); + // NInfo should be reusable after we change its content. + EXPECT_TRUE(NInfo->canReuse(*Invocation, FS.view(TestDir))); +} + +// An End-to-End test for modules. 
+TEST_F(PrerequisiteModulesTests, ParsedASTTest) { + MockDirectoryCompilationDatabase CDB(TestDir, FS); + + CDB.addFile("A.cppm", R"cpp( +export module A; +export void printA(); + )cpp"); + + CDB.addFile("Use.cpp", R"cpp( +import A; +)cpp"); + + ModulesBuilder Builder(CDB); + + ParseInputs Use = getInputs("Use.cpp", CDB); + Use.ModulesManager = &Builder; + + std::unique_ptr CI = + buildCompilerInvocation(Use, DiagConsumer); + EXPECT_TRUE(CI); + + auto Preamble = + buildPreamble(getFullPath("Use.cpp"), *CI, Use, /*InMemory=*/true, + /*Callback=*/nullptr); + EXPECT_TRUE(Preamble); + EXPECT_TRUE(Preamble->RequiredModules); + + auto AST = ParsedAST::build(getFullPath("Use.cpp"), Use, std::move(CI), {}, + Preamble); + EXPECT_TRUE(AST); + + const NamedDecl &D = findDecl(*AST, "printA"); + EXPECT_TRUE(D.isFromASTFile()); +} + +} // namespace +} // namespace clang::clangd + +#endif diff --git a/clang-tools-extra/clangd/unittests/TestFS.h b/clang-tools-extra/clangd/unittests/TestFS.h index 6bdadc9c07439..568533f3b3b91 100644 --- a/clang-tools-extra/clangd/unittests/TestFS.h +++ b/clang-tools-extra/clangd/unittests/TestFS.h @@ -67,7 +67,7 @@ class MockCompilationDatabase : public GlobalCompilationDatabase { std::vector ExtraClangFlags; -private: +protected: StringRef Directory; StringRef RelPathPrefix; }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 004811d2eca4f..697b514ae1572 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -48,6 +48,10 @@ Major New Features Improvements to clangd ---------------------- +- Introduced exmperimental support for C++20 Modules. The experimental support can + be enabled by `-experimental-modules-support` option. It is in an early development + stage and may not perform efficiently in real-world scenarios. + Inlay hints ^^^^^^^^^^^ From 3e47f6ba4a2aae7a8414dfeafa21d8d79e806c43 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 17 Jul 2024 19:39:04 -0700 Subject: [PATCH 362/777] Rapply "[Target] Use range-based for loops (NFC) (#98844)" This iteration drops hunks where the loop body adds more elements. 
--- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 4 ++-- llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 4 ++-- llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 4 ++-- llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 12 ++++++------ llvm/lib/Target/Mips/MipsFastISel.cpp | 4 ++-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 4 ++-- llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp | 4 ++-- llvm/lib/Target/PowerPC/PPCFastISel.cpp | 8 ++++---- .../Target/WebAssembly/WebAssemblyRegColoring.cpp | 3 +-- llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp | 5 ++--- 10 files changed, 25 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 831b6b0fc7223..e94b0f6e1a44f 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1673,8 +1673,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, .addReg(ARM::SP) .add(predOps(ARMCC::AL)) .setMIFlags(MachineInstr::FrameDestroy); - for (unsigned i = 0, e = Regs.size(); i < e; ++i) - MIB.addReg(Regs[i], getDefRegState(true)); + for (unsigned Reg : Regs) + MIB.addReg(Reg, getDefRegState(true)); if (DeleteRet) { if (MI != MBB.end()) { MIB.copyImplicitOps(*MI); diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index e5e817f1ed9a2..b55b9a42e52cd 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2579,8 +2579,8 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { Bases.push_back(Base); return; } - for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { - if (Offset == getMemoryOpOffset(*BI->second[i])) { + for (const MachineInstr *MI : BI->second) { + if (Offset == getMemoryOpOffset(*MI)) { StopHere = true; break; } diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 99745941d5798..6926b02701771 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1056,8 +1056,8 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { continue; B->erase(MI); - for (unsigned i = 0, n = Regs.size(); i != n; ++i) - MRI.markUsesInDebugValueAsUndef(Regs[i]); + for (unsigned Reg : Regs) + MRI.markUsesInDebugValueAsUndef(Reg); Changed = true; } diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index a4304b0531666..8840c272057ab 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1040,8 +1040,8 @@ void HexagonGenInsert::pruneEmptyLists() { if (I->second.empty()) Prune.push_back(I); } - for (unsigned i = 0, n = Prune.size(); i < n; ++i) - IFMap.erase(Prune[i]); + for (const auto &It : Prune) + IFMap.erase(It); } void HexagonGenInsert::pruneCoveredSets(unsigned VR) { @@ -1470,8 +1470,8 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { continue; B->erase(MI); - for (unsigned I = 0, N = Regs.size(); I != N; ++I) - MRI->markUsesInDebugValueAsUndef(Regs[I]); + for (unsigned Reg : Regs) + MRI->markUsesInDebugValueAsUndef(Reg); Changed = true; } @@ -1582,8 +1582,8 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { if (Idx >= Cutoff) Out.push_back(I); } - for (unsigned i = 0, n = Out.size(); i < n; ++i) - IFMap.erase(Out[i]); + for (const auto &It : Out) + IFMap.erase(It); } if (IFMap.empty()) return Changed; diff --git 
a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index ec12af66ff2d4..bd8ef43da625c 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -1763,8 +1763,8 @@ bool MipsFastISel::selectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } MachineInstrBuilder MIB = emitInst(Mips::RetRA); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned Reg : RetRegs) + MIB.addReg(Reg, RegState::Implicit); return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index d6e20932a247e..0b654abd2814c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -861,8 +861,8 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) { *static_cast(NTM.getSubtargetImpl()); // Print out module-level global variables in proper order - for (unsigned i = 0, e = Globals.size(); i != e; ++i) - printModuleLevelGV(Globals[i], OS2, /*processDemoted=*/false, STI); + for (const GlobalVariable *GV : Globals) + printModuleLevelGV(GV, OS2, /*processDemoted=*/false, STI); OS2 << '\n'; diff --git a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index ff3d36d39fb29..4c522e2c5be41 100644 --- a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -149,8 +149,8 @@ namespace { Changed = true; } - for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i) - PredToRemove[i]->removeSuccessor(&ReturnMBB, true); + for (MachineBasicBlock *MBB : PredToRemove) + MBB->removeSuccessor(&ReturnMBB, true); if (Changed && !ReturnMBB.hasAddressTaken()) { // We now might be able to merge this blr-only block into its diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 0e04bb944c3bb..8d364bcb22394 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -1668,8 +1668,8 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { } // Add implicit physical register uses to the call. - for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II) - MIB.addReg(RegArgs[II], RegState::Implicit); + for (unsigned Reg : RegArgs) + MIB.addReg(Reg, RegState::Implicit); // Direct calls, in both the ELF V1 and V2 ABIs, need the TOC register live // into the call. 
@@ -1793,8 +1793,8 @@ bool PPCFastISel::SelectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::BLR8)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned Reg : RetRegs) + MIB.addReg(Reg, RegState::Implicit); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index 8a74d77e369f6..7dc5c099c1270 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -135,8 +135,7 @@ static void undefInvalidDbgValues( #ifndef NDEBUG DenseSet SeenRegs; #endif - for (size_t I = 0, E = Assignments.size(); I < E; ++I) { - const auto &CoalescedIntervals = Assignments[I]; + for (const auto &CoalescedIntervals : Assignments) { if (CoalescedIntervals.empty()) continue; for (LiveInterval *LI : CoalescedIntervals) { diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 793e624eefa8a..95962d1a0a240 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -178,8 +178,7 @@ bool XCoreLowerThreadLocal::runOnModule(Module &M) { for (GlobalVariable &GV : M.globals()) if (GV.isThreadLocal()) ThreadLocalGlobals.push_back(&GV); - for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) { - MadeChange |= lowerGlobal(ThreadLocalGlobals[I]); - } + for (GlobalVariable *GV : ThreadLocalGlobals) + MadeChange |= lowerGlobal(GV); return MadeChange; } From ad154281230d83ee551e12d5be48bb956ef47ed3 Mon Sep 17 00:00:00 2001 From: David Tenty Date: Wed, 17 Jul 2024 22:36:01 -0400 Subject: [PATCH 363/777] [NFC][libc++][test] loosen XFAIL condition for setfill_wchar_max.pass.cpp So we can also match aarch64 triples which have four components instead of three when disabling the test, which the case on some buildbots. Follow on to #89305 --- .../iostream.format/std.manip/setfill_wchar_max.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp index f22850877dd62..d220a5c36a23b 100644 --- a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp @@ -15,8 +15,8 @@ // version 2 implementation fixes the problem. // XFAIL: target={{.*}}-windows{{.*}} && libcpp-abi-version=1 -// XFAIL: target=armv{{7|8}}l-linux-gnueabihf && libcpp-abi-version=1 -// XFAIL: target=aarch64-linux-gnu && libcpp-abi-version=1 +// XFAIL: target=armv{{7|8}}l{{.*}}-linux-gnueabihf && libcpp-abi-version=1 +// XFAIL: target=aarch64{{.*}}-linux-gnu && libcpp-abi-version=1 #include #include From edfe25064e13c9cabf1cf3398f7760bf0991ae3e Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 17 Jul 2024 20:25:18 -0700 Subject: [PATCH 364/777] [MemProf] Consolidate increments in callee matching code (#99385) To facilitate some follow on changes, consolidate the incrementing of the edge iterator used during callee matching to the for loop statement. This requires an additional adjustment in the case of tail call handling. 
--- .../IPO/MemProfContextDisambiguation.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index ef9ddeaaab632..66bd786c85df5 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -505,9 +505,8 @@ class CallsiteContextGraph { /// we were able to identify the call chain through intermediate tail calls. /// In the latter case new context nodes are added to the graph for the /// identified tail calls, and their synthesized nodes are added to - /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the - /// next element after the input position (either incremented or updated after - /// removing the old edge). + /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for + /// the updated edges and to prepare it for an increment in the caller. bool calleesMatch(CallTy Call, EdgeIter &EI, MapVector &TailCallToContextNodeMap); @@ -1835,12 +1834,11 @@ void CallsiteContextGraphClones.empty()); // Check all node callees and see if in the same function. auto Call = Node->Call.call(); - for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { + for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); + ++EI) { auto Edge = *EI; - if (!Edge->Callee->hasCall()) { - ++EI; + if (!Edge->Callee->hasCall()) continue; - } assert(NodeToCallingFunc.count(Edge->Callee)); // Check if the called function matches that of the callee node. if (calleesMatch(Call, EI, TailCallToContextNodeMap)) @@ -1889,16 +1887,12 @@ bool CallsiteContextGraph::calleesMatch( // calls between the profiled caller and callee. std::vector> FoundCalleeChain; if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc, - FoundCalleeChain)) { - ++EI; + FoundCalleeChain)) return false; - } // The usual case where the profiled callee matches that of the IR/summary. - if (FoundCalleeChain.empty()) { - ++EI; + if (FoundCalleeChain.empty()) return true; - } auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { auto *CurEdge = Callee->findEdgeFromCaller(Caller); @@ -1960,6 +1954,13 @@ bool CallsiteContextGraph::calleesMatch( Edge->Callee->eraseCallerEdge(Edge.get()); EI = Edge->Caller->CalleeEdges.erase(EI); + // To simplify the increment of EI in the caller, subtract one from EI. + // In the final AddEdge call we would have either added a new callee edge, + // to Edge->Caller, or found an existing one. Either way we are guaranteed + // that there is at least one callee edge. + assert(!Edge->Caller->CalleeEdges.empty()); + --EI; + return true; } From c184b94ff6546c8ba8ac54b5127189427567978f Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 18 Jul 2024 11:24:02 +0800 Subject: [PATCH 365/777] [C++20] [Modules] Write ODRHash for decls in GMF Previously, we skipped calculating ODRHash for decls in GMF when writing them to .pcm files as an optimization. But actually, it is not true that this will be a pure optimization. Whether or not it is beneficial depends on the use cases. For example, if we're writing a function `a` in module and there are 10 consumers of `a` in other TUs, then the other TUs will pay for the cost to calculate the ODR hash for `a` ten times. Then this optimization doesn't work. However, if all the consumers of the module didn't touch `a`, then we can save the cost to calculate the ODR hash of `a` for 1 times. 
And the assumption to make it was: generally, the consumers of a module may only consume a small part of the imported module. This is the reason why we tried to load declarations, types and identifiers lazily. Then it looks good to do the similar thing for calculating ODR hashs. It works fine for a long time, until we started to look into the support of modules in clangd. Then we meet multiple issue reports complaining we're calculating ODR hash in the wrong place. To workaround these issue reports, I decided to always write the ODRhash for decls in GMF. In my local test, I only observed less than 1% compile time regression after doing this. So it should be fine. --- clang/lib/Serialization/ASTReaderDecl.cpp | 28 ++++++----------------- clang/lib/Serialization/ASTWriter.cpp | 11 +++------ clang/lib/Serialization/ASTWriterDecl.cpp | 17 ++++---------- 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 76032aa836b50..3de9d327f1b3b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -794,15 +794,12 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { BitsUnpacker EnumDeclBits(Record.readInt()); ED->setNumPositiveBits(EnumDeclBits.getNextBits(/*Width=*/8)); ED->setNumNegativeBits(EnumDeclBits.getNextBits(/*Width=*/8)); - bool ShouldSkipCheckingODR = EnumDeclBits.getNextBit(); ED->setScoped(EnumDeclBits.getNextBit()); ED->setScopedUsingClassTag(EnumDeclBits.getNextBit()); ED->setFixed(EnumDeclBits.getNextBit()); - if (!ShouldSkipCheckingODR) { - ED->setHasODRHash(true); - ED->ODRHash = Record.readInt(); - } + ED->setHasODRHash(true); + ED->ODRHash = Record.readInt(); // If this is a definition subject to the ODR, and we already have a // definition, merge this one into it. @@ -864,9 +861,6 @@ ASTDeclReader::VisitRecordDeclImpl(RecordDecl *RD) { void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) { VisitRecordDeclImpl(RD); - // We should only reach here if we're in C/Objective-C. There is no - // global module fragment. 
- assert(!shouldSkipCheckingODR(RD)); RD->setODRHash(Record.readInt()); // Maintain the invariant of a redeclaration chain containing only @@ -1066,7 +1060,6 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3)); FD->setStorageClass((StorageClass)FunctionDeclBits.getNextBits(/*Width=*/3)); - bool ShouldSkipCheckingODR = FunctionDeclBits.getNextBit(); FD->setInlineSpecified(FunctionDeclBits.getNextBit()); FD->setImplicitlyInline(FunctionDeclBits.getNextBit()); FD->setHasSkippedBody(FunctionDeclBits.getNextBit()); @@ -1096,10 +1089,8 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { if (FD->isExplicitlyDefaulted()) FD->setDefaultLoc(readSourceLocation()); - if (!ShouldSkipCheckingODR) { - FD->ODRHash = Record.readInt(); - FD->setHasODRHash(true); - } + FD->ODRHash = Record.readInt(); + FD->setHasODRHash(true); if (FD->isDefaulted() || FD->isDeletedAsWritten()) { // If 'Info' is nonzero, we need to read an DefaultedOrDeletedInfo; if, @@ -1971,8 +1962,6 @@ void ASTDeclReader::ReadCXXDefinitionData( BitsUnpacker CXXRecordDeclBits = Record.readInt(); - bool ShouldSkipCheckingODR = CXXRecordDeclBits.getNextBit(); - #define FIELD(Name, Width, Merge) \ if (!CXXRecordDeclBits.canGetNextNBits(Width)) \ CXXRecordDeclBits.updateValue(Record.readInt()); \ @@ -1981,12 +1970,9 @@ void ASTDeclReader::ReadCXXDefinitionData( #include "clang/AST/CXXRecordDeclDefinitionBits.def" #undef FIELD - // We only perform ODR checks for decls not in GMF. - if (!ShouldSkipCheckingODR) { - // Note: the caller has deserialized the IsLambda bit already. - Data.ODRHash = Record.readInt(); - Data.HasODRHash = true; - } + // Note: the caller has deserialized the IsLambda bit already. + Data.ODRHash = Record.readInt(); + Data.HasODRHash = true; if (Record.readInt()) { Reader.DefinitionSource[D] = diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 5b5b468532f32..c78d8943d6d92 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6543,9 +6543,6 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; - bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); - DefinitionBits.addBit(ShouldSkipCheckingODR); - #define FIELD(Name, Width, Merge) \ if (!DefinitionBits.canWriteNextNBits(Width)) { \ Record->push_back(DefinitionBits); \ @@ -6558,11 +6555,9 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { Record->push_back(DefinitionBits); - // We only perform ODR checks for decls not in GMF. - if (!ShouldSkipCheckingODR) - // getODRHash will compute the ODRHash if it has not been previously - // computed. - Record->push_back(D->getODRHash()); + // getODRHash will compute the ODRHash if it has not been previously + // computed. 
+ Record->push_back(D->getODRHash()); bool ModulesDebugInfo = Writer->Context->getLangOpts().ModulesDebugInfo && !D->isDependentType(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 5dff0cec5c0ea..17c774038571e 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -527,16 +527,12 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { BitsPacker EnumDeclBits; EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8); EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8); - bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); - EnumDeclBits.addBit(ShouldSkipCheckingODR); EnumDeclBits.addBit(D->isScoped()); EnumDeclBits.addBit(D->isScopedUsingClassTag()); EnumDeclBits.addBit(D->isFixed()); Record.push_back(EnumDeclBits); - // We only perform ODR checks for decls not in GMF. - if (!ShouldSkipCheckingODR) - Record.push_back(D->getODRHash()); + Record.push_back(D->getODRHash()); if (MemberSpecializationInfo *MemberInfo = D->getMemberSpecializationInfo()) { Record.AddDeclRef(MemberInfo->getInstantiatedFrom()); @@ -553,7 +549,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && - !needsAnonymousDeclarationNumber(D) && !shouldSkipCheckingODR(D) && + !needsAnonymousDeclarationNumber(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -719,8 +715,6 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { // FIXME: stable encoding FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); - bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); - FunctionDeclBits.addBit(ShouldSkipCheckingODR); FunctionDeclBits.addBit(D->isInlineSpecified()); FunctionDeclBits.addBit(D->isInlined()); FunctionDeclBits.addBit(D->hasSkippedBody()); @@ -746,9 +740,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { if (D->isExplicitlyDefaulted()) Record.AddSourceLocation(D->getDefaultLoc()); - // We only perform ODR checks for decls not in GMF. - if (!ShouldSkipCheckingODR) - Record.push_back(D->getODRHash()); + Record.push_back(D->getODRHash()); if (D->isDefaulted() || D->isDeletedAsWritten()) { if (auto *FDI = D->getDefalutedOrDeletedInfo()) { @@ -1560,8 +1552,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !shouldSkipCheckingODR(D) && !D->hasExtInfo() && - !D->isExplicitlyDefaulted()) { + !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || D->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization || From 464d321ee8dde1eaf14b5537eaf030e6df513849 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Thu, 18 Jul 2024 06:00:36 +0200 Subject: [PATCH 366/777] [flang][stack-arrays] Extend pass to work on declare ops and within omp regions (#98810) Extends the stack-arrays pass to support `fir.declare` ops. Before that, we did not recognize malloc-free pairs for which `fir.declare` is used to declare the allocated entity. 
This is because the `free` op was invoked on the result of the `fir.declare` op and did not directly use the allocated memory SSA value. This also extends the pass to collect the analysis results within OpenMP regions. --- .../lib/Optimizer/Transforms/StackArrays.cpp | 54 +++++++++++++++--- flang/test/Transforms/stack-arrays-hlfir.f90 | 55 +++++++++++++++++++ flang/test/Transforms/stack-arrays.fir | 41 ++++++++++++-- 3 files changed, 138 insertions(+), 12 deletions(-) create mode 100644 flang/test/Transforms/stack-arrays-hlfir.f90 diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index e8fa70ebc39d8..bdc2d9cd9c6c4 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -287,7 +287,7 @@ mlir::ChangeResult LatticePoint::join(const AbstractDenseLattice &lattice) { void LatticePoint::print(llvm::raw_ostream &os) const { for (const auto &[value, state] : stateMap) { - os << value << ": "; + os << "\n * " << value << ": "; ::print(os, state); } } @@ -361,6 +361,13 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op, } else if (mlir::isa(op)) { assert(op->getNumOperands() == 1 && "fir.freemem has one operand"); mlir::Value operand = op->getOperand(0); + + // Note: StackArrays is scheduled in the pass pipeline after lowering hlfir + // to fir. Therefore, we only need to handle `fir::DeclareOp`s. + if (auto declareOp = + llvm::dyn_cast_if_present(operand.getDefiningOp())) + operand = declareOp.getMemref(); + std::optional operandState = before.get(operand); if (operandState && *operandState == AllocationState::Allocated) { // don't tag things not allocated in this function as freed, so that we @@ -452,6 +459,9 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) { }; func->walk([&](mlir::func::ReturnOp child) { joinOperationLattice(child); }); func->walk([&](fir::UnreachableOp child) { joinOperationLattice(child); }); + func->walk( + [&](mlir::omp::TerminatorOp child) { joinOperationLattice(child); }); + llvm::DenseSet freedValues; point.appendFreedValues(freedValues); @@ -518,9 +528,18 @@ AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem, // remove freemem operations llvm::SmallVector erases; - for (mlir::Operation *user : allocmem.getOperation()->getUsers()) + for (mlir::Operation *user : allocmem.getOperation()->getUsers()) { + if (auto declareOp = mlir::dyn_cast_if_present(user)) { + for (mlir::Operation *user : declareOp->getUsers()) { + if (mlir::isa(user)) + erases.push_back(user); + } + } + if (mlir::isa(user)) erases.push_back(user); + } + // now we are done iterating the users, it is safe to mutate them for (mlir::Operation *erase : erases) rewriter.eraseOp(erase); @@ -633,9 +652,19 @@ AllocMemConversion::findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc) { // find freemem ops llvm::SmallVector freeOps; - for (mlir::Operation *user : oldAllocOp->getUsers()) + + for (mlir::Operation *user : oldAllocOp->getUsers()) { + if (auto declareOp = mlir::dyn_cast_if_present(user)) { + for (mlir::Operation *user : declareOp->getUsers()) { + if (mlir::isa(user)) + freeOps.push_back(user); + } + } + if (mlir::isa(user)) freeOps.push_back(user); + } + assert(freeOps.size() && "DFA should only return freed memory"); // Don't attempt to reason about a stacksave/stackrestore between different @@ -717,12 +746,23 @@ void AllocMemConversion::insertStackSaveRestore( mlir::SymbolRefAttr stackRestoreSym = 
builder.getSymbolRefAttr(stackRestoreFn.getName()); + auto createStackRestoreCall = [&](mlir::Operation *user) { + builder.setInsertionPoint(user); + builder.create(user->getLoc(), + stackRestoreFn.getFunctionType().getResults(), + stackRestoreSym, mlir::ValueRange{sp}); + }; + for (mlir::Operation *user : oldAlloc->getUsers()) { + if (auto declareOp = mlir::dyn_cast_if_present(user)) { + for (mlir::Operation *user : declareOp->getUsers()) { + if (mlir::isa(user)) + createStackRestoreCall(user); + } + } + if (mlir::isa(user)) { - builder.setInsertionPoint(user); - builder.create(user->getLoc(), - stackRestoreFn.getFunctionType().getResults(), - stackRestoreSym, mlir::ValueRange{sp}); + createStackRestoreCall(user); } } diff --git a/flang/test/Transforms/stack-arrays-hlfir.f90 b/flang/test/Transforms/stack-arrays-hlfir.f90 new file mode 100644 index 0000000000000..50261b3078466 --- /dev/null +++ b/flang/test/Transforms/stack-arrays-hlfir.f90 @@ -0,0 +1,55 @@ +! Similar to stack-arrays.f90; i.e. both test the stack-arrays pass for different +! kinds of supported inputs. This one differs in that it takes the hlfir lowering +! path in flag rather than the fir one. For example, temp arrays are lowered +! differently in hlfir vs. fir and the IR that reaches the stack arrays pass looks +! quite different. + + +! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - \ +! RUN: | fir-opt --lower-hlfir-ordered-assignments \ +! RUN: --bufferize-hlfir \ +! RUN: --convert-hlfir-to-fir \ +! RUN: --array-value-copy \ +! RUN: --stack-arrays \ +! RUN: | FileCheck %s + +subroutine temp_array + implicit none + integer (8) :: lV + integer (8), dimension (2) :: iaVS + + lV = 202 + + iaVS = [lV, lV] +end subroutine temp_array +! CHECK-LABEL: func.func @_QPtemp_array{{.*}} { +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array<2xi64> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine omp_temp_array + implicit none + integer (8) :: lV + integer (8), dimension (2) :: iaVS + + lV = 202 + + !$omp target + iaVS = [lV, lV] + !$omp end target +end subroutine omp_temp_array +! CHECK-LABEL: func.func @_QPomp_temp_array{{.*}} { +! CHECK: omp.target {{.*}} { +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array<2xi64> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: omp.terminator +! CHECK-NEXT: } +! CHECK: return +! CHECK-NEXT: } diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index a2ffe555091eb..45c22c15f7995 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -339,13 +339,10 @@ func.func @omp_placement1() { return } // CHECK: func.func @omp_placement1() { +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: omp.sections { // CHECK-NEXT: omp.section { -// CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<42xi32> -// TODO: this allocation should be moved to the stack. Unfortunately, the data -// flow analysis fails to propogate the lattice out of the omp region to the -// return satement. 
-// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> // CHECK-NEXT: omp.terminator // CHECK-NEXT: } // CHECK-NEXT: omp.terminator @@ -369,3 +366,37 @@ func.func @stop_terminator() { // CHECK-NEXT: %[[NONE:.*]] = fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> none // CHECK-NEXT: fir.unreachable // CHECK-NEXT: } + + +// check that stack allocations that use fir.declare which must be placed in loops +// use stacksave +func.func @placement_loop_declare() { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) { + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + %5 = fir.declare %4 {uniq_name = "temp"} : (!fir.heap>) -> !fir.heap> + // ... + fir.freemem %5 : !fir.heap> + fir.result %3, %c1_i32 : index, i32 + } + return +} +// CHECK: func.func @placement_loop_declare() { +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 +// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: fir.do_loop +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index +// CHECK-NEXT: %[[SP:.*]] = fir.call @llvm.stacksave.p0() : () -> !fir.ref +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] +// CHECK: fir.call @llvm.stackrestore.p0(%[[SP]]) +// CHECK-NEXT: fir.result +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } From 5338bd3c8ac5e313a09fffbe84aacc51a16e17f8 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 17 Jul 2024 21:57:52 -0700 Subject: [PATCH 367/777] [SandboxIR] IR Tracker (#99238) This is the first patch in a series of patches for the IR change tracking component of SandboxIR. The tracker collects changes in a vector of `IRChangeBase` objects and provides a `save()`/`accept()`/`revert()` API. Each type of IR changing event is captured by a dedicated subclass of `IRChangeBase`. This patch implements only one of them, that for updating a `sandboxir::Use` source value, named `UseSet`. --- llvm/docs/SandboxIR.md | 18 +++ llvm/include/llvm/SandboxIR/SandboxIR.h | 12 ++ llvm/include/llvm/SandboxIR/Tracker.h | 155 +++++++++++++++++++++++ llvm/include/llvm/SandboxIR/Use.h | 1 + llvm/lib/SandboxIR/CMakeLists.txt | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 26 +++- llvm/lib/SandboxIR/Tracker.cpp | 82 ++++++++++++ llvm/unittests/SandboxIR/CMakeLists.txt | 1 + llvm/unittests/SandboxIR/TrackerTest.cpp | 148 ++++++++++++++++++++++ 9 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 llvm/include/llvm/SandboxIR/Tracker.h create mode 100644 llvm/lib/SandboxIR/Tracker.cpp create mode 100644 llvm/unittests/SandboxIR/TrackerTest.cpp diff --git a/llvm/docs/SandboxIR.md b/llvm/docs/SandboxIR.md index 8f8752f102c76..3b792659bb59b 100644 --- a/llvm/docs/SandboxIR.md +++ b/llvm/docs/SandboxIR.md @@ -51,3 +51,21 @@ For example, for `sandboxir::User::setOperand(OpIdx, sandboxir::Value *Op)`: - We get the corresponding LLVM User: `llvm::User *LLVMU = cast(Val)` - Next we get the corresponding LLVM Operand: `llvm::Value *LLVMOp = Op->Val` - Finally we modify `LLVMU`'s operand: `LLVMU->setOperand(OpIdx, LLVMOp) + +## IR Change Tracking +Sandbox IR's state can be saved and restored. 
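+
+A minimal usage sketch of the flow described below (illustrative only; `Ctx`, `I`, `NewOp`, and `KeepChanges` are assumed to exist in the surrounding code):
+
+```cpp
+// Ctx is a sandboxir::Context, I a sandboxir::Instruction*, NewOp a
+// sandboxir::Value*, and KeepChanges a bool -- all assumed, see note above.
+Ctx.save();              // start tracking changes
+I->setOperand(0, NewOp); // recorded automatically by the tracker
+if (KeepChanges)
+  Ctx.accept();          // finalize the changes and stop tracking
+else
+  Ctx.revert();          // restore the original operand and stop tracking
+```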
+This is done with the help of the tracker component that is tightly coupled to the public Sandbox IR API functions. +Please note that nested saves/restores are currently not supported. + +To save the state and enable tracking the user needs to call `sandboxir::Context::save()`. +From this point on any change made to the Sandbox IR state will automatically create a change object and register it with the tracker, without any intervention from the user. +The changes are accumulated in a vector within the tracker. + +To rollback to the saved state the user needs to call `sandboxir::Context::revert()`. +Reverting back to the saved state is a matter of going over all the accumulated changes in reverse and undoing each individual change. + +To accept the changes made to the IR the user needs to call `sandboxir::Context::accept()`. +Internally this will go through the changes and run any finalization required. + +Please note that after a call to `revert()` or `accept()` tracking will stop. +To start tracking again, the user needs to call `save()`. diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 473bd93aea7c1..c5d59ba47ca31 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -61,6 +61,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/SandboxIR/Tracker.h" #include "llvm/SandboxIR/Use.h" #include "llvm/Support/raw_ostream.h" #include @@ -171,6 +172,7 @@ class Value { friend class Context; // For getting `Val`. friend class User; // For getting `Val`. + friend class Use; // For getting `Val`. /// All values point to the context. Context &Ctx; @@ -641,6 +643,8 @@ class BasicBlock : public Value { class Context { protected: LLVMContext &LLVMCtx; + Tracker IRTracker; + /// Maps LLVM Value to the corresponding sandboxir::Value. Owns all /// SandboxIR objects. DenseMap> @@ -680,6 +684,14 @@ class Context { public: Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx) {} + Tracker &getTracker() { return IRTracker; } + /// Convenience function for `getTracker().save()` + void save() { IRTracker.save(); } + /// Convenience function for `getTracker().revert()` + void revert() { IRTracker.revert(); } + /// Convenience function for `getTracker().accept()` + void accept() { IRTracker.accept(); } + sandboxir::Value *getValue(llvm::Value *V) const; const sandboxir::Value *getValue(const llvm::Value *V) const { return getValue(const_cast(V)); diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h new file mode 100644 index 0000000000000..2d0904f5665b1 --- /dev/null +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -0,0 +1,155 @@ +//===- Tracker.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is the component of SandboxIR that tracks all changes made to its +// state, such that we can revert the state when needed. +// +// Tracking changes +// ---------------- +// The user needs to call `Tracker::save()` to enable tracking changes +// made to SandboxIR. From that point on, any change made to SandboxIR, will +// automatically create a change tracking object and register it with the +// tracker. 
IR-change objects are subclasses of `IRChangeBase` and get +// registered with the `Tracker::track()` function. The change objects +// are saved in the order they are registered with the tracker and are stored in +// the `Tracker::Changes` vector. All of this is done transparently to +// the user. +// +// Reverting changes +// ----------------- +// Calling `Tracker::revert()` will restore the state saved when +// `Tracker::save()` was called. Internally this goes through the +// change objects in `Tracker::Changes` in reverse order, calling their +// `IRChangeBase::revert()` function one by one. +// +// Accepting changes +// ----------------- +// The user needs to either revert or accept changes before the tracker object +// is destroyed. This is enforced in the tracker's destructor. +// This is the job of `Tracker::accept()`. Internally this will go +// through the change objects in `Tracker::Changes` in order, calling +// `IRChangeBase::accept()`. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SANDBOXIR_TRACKER_H +#define LLVM_SANDBOXIR_TRACKER_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/SandboxIR/Use.h" +#include "llvm/Support/Debug.h" +#include +#include + +namespace llvm::sandboxir { + +class BasicBlock; +class Tracker; + +/// The base class for IR Change classes. +class IRChangeBase { +protected: + Tracker &Parent; + +public: + IRChangeBase(Tracker &Parent); + /// This runs when changes get reverted. + virtual void revert() = 0; + /// This runs when changes get accepted. + virtual void accept() = 0; + virtual ~IRChangeBase() = default; +#ifndef NDEBUG + /// \Returns the index of this change by iterating over all changes in the + /// tracker. This is only used for debugging. + unsigned getIdx() const; + void dumpCommon(raw_ostream &OS) const { OS << getIdx() << ". "; } + virtual void dump(raw_ostream &OS) const = 0; + LLVM_DUMP_METHOD virtual void dump() const = 0; + friend raw_ostream &operator<<(raw_ostream &OS, const IRChangeBase &C) { + C.dump(OS); + return OS; + } +#endif +}; + +/// Tracks the change of the source Value of a sandboxir::Use. +class UseSet : public IRChangeBase { + Use U; + Value *OrigV = nullptr; + +public: + UseSet(const Use &U, Tracker &Tracker) + : IRChangeBase(Tracker), U(U), OrigV(U.get()) {} + void revert() final { U.set(OrigV); } + void accept() final {} +#ifndef NDEBUG + void dump(raw_ostream &OS) const final { + dumpCommon(OS); + OS << "UseSet"; + } + LLVM_DUMP_METHOD void dump() const final; +#endif +}; + +/// The tracker collects all the change objects and implements the main API for +/// saving / reverting / accepting. +class Tracker { +public: + enum class TrackerState { + Disabled, ///> Tracking is disabled + Record, ///> Tracking changes + }; + +private: + /// The list of changes that are being tracked. + SmallVector> Changes; +#ifndef NDEBUG + friend unsigned IRChangeBase::getIdx() const; // For accessing `Changes`. +#endif + /// The current state of the tracker. + TrackerState State = TrackerState::Disabled; + +public: +#ifndef NDEBUG + /// Helps catch bugs where we are creating new change objects while in the + /// middle of creating other change objects. + bool InMiddleOfCreatingChange = false; +#endif // NDEBUG + + Tracker() = default; + ~Tracker(); + /// Record \p Change and take ownership. This is the main function used to + /// track Sandbox IR changes. 
+ void track(std::unique_ptr &&Change); + /// \Returns true if the tracker is recording changes. + bool isTracking() const { return State == TrackerState::Record; } + /// \Returns the current state of the tracker. + TrackerState getState() const { return State; } + /// Turns on IR tracking. + void save(); + /// Stops tracking and accept changes. + void accept(); + /// Stops tracking and reverts to saved state. + void revert(); + +#ifndef NDEBUG + void dump(raw_ostream &OS) const; + LLVM_DUMP_METHOD void dump() const; + friend raw_ostream &operator<<(raw_ostream &OS, const Tracker &Tracker) { + Tracker.dump(OS); + return OS; + } +#endif // NDEBUG +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_TRACKER_H diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h index 33afb54c1ff29..d77b4568d0fab 100644 --- a/llvm/include/llvm/SandboxIR/Use.h +++ b/llvm/include/llvm/SandboxIR/Use.h @@ -44,6 +44,7 @@ class Use { public: operator Value *() const { return get(); } Value *get() const; + void set(Value *V); class User *getUser() const { return Usr; } unsigned getOperandNo() const; Context *getContext() const { return Ctx; } diff --git a/llvm/lib/SandboxIR/CMakeLists.txt b/llvm/lib/SandboxIR/CMakeLists.txt index 225eca0cadd1a..6c0666b186b8a 100644 --- a/llvm/lib/SandboxIR/CMakeLists.txt +++ b/llvm/lib/SandboxIR/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMSandboxIR SandboxIR.cpp + Tracker.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/SandboxIR diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 2984c6eaccd64..944869a37989c 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -16,6 +16,8 @@ using namespace llvm::sandboxir; Value *Use::get() const { return Ctx->getValue(LLVMUse->get()); } +void Use::set(Value *V) { LLVMUse->set(V->Val); } + unsigned Use::getOperandNo() const { return Usr->getUseOperandNo(*this); } #ifndef NDEBUG @@ -115,13 +117,24 @@ void Value::replaceUsesWithIf( User *DstU = cast_or_null(Ctx.getValue(LLVMUse.getUser())); if (DstU == nullptr) return false; - return ShouldReplace(Use(&LLVMUse, DstU, Ctx)); + Use UseToReplace(&LLVMUse, DstU, Ctx); + if (!ShouldReplace(UseToReplace)) + return false; + auto &Tracker = Ctx.getTracker(); + if (Tracker.isTracking()) + Tracker.track(std::make_unique(UseToReplace, Tracker)); + return true; }); } void Value::replaceAllUsesWith(Value *Other) { assert(getType() == Other->getType() && "Replacing with Value of different type!"); + auto &Tracker = Ctx.getTracker(); + if (Tracker.isTracking()) { + for (auto Use : uses()) + Tracker.track(std::make_unique(Use, Tracker)); + } // We are delegating RAUW to LLVM IR's RAUW. Val->replaceAllUsesWith(Other->Val); } @@ -212,11 +225,22 @@ bool User::classof(const Value *From) { void User::setOperand(unsigned OperandIdx, Value *Operand) { assert(isa(Val) && "No operands!"); + auto &Tracker = Ctx.getTracker(); + if (Tracker.isTracking()) + Tracker.track(std::make_unique(getOperandUse(OperandIdx), Tracker)); // We are delegating to llvm::User::setOperand(). cast(Val)->setOperand(OperandIdx, Operand->Val); } bool User::replaceUsesOfWith(Value *FromV, Value *ToV) { + auto &Tracker = Ctx.getTracker(); + if (Tracker.isTracking()) { + for (auto OpIdx : seq(0, getNumOperands())) { + auto Use = getOperandUse(OpIdx); + if (Use.get() == FromV) + Tracker.track(std::make_unique(Use, Tracker)); + } + } // We are delegating RUOW to LLVM IR's RUOW. 
return cast(Val)->replaceUsesOfWith(FromV->Val, ToV->Val); } diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp new file mode 100644 index 0000000000000..1182f5c55d10b --- /dev/null +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -0,0 +1,82 @@ +//===- Tracker.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/SandboxIR/Tracker.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Instruction.h" +#include "llvm/SandboxIR/SandboxIR.h" +#include + +using namespace llvm::sandboxir; + +IRChangeBase::IRChangeBase(Tracker &Parent) : Parent(Parent) { +#ifndef NDEBUG + assert(!Parent.InMiddleOfCreatingChange && + "We are in the middle of creating another change!"); + if (Parent.isTracking()) + Parent.InMiddleOfCreatingChange = true; +#endif // NDEBUG +} + +#ifndef NDEBUG +unsigned IRChangeBase::getIdx() const { + auto It = + find_if(Parent.Changes, [this](auto &Ptr) { return Ptr.get() == this; }); + return It - Parent.Changes.begin(); +} + +void UseSet::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG + +Tracker::~Tracker() { + assert(Changes.empty() && "You must accept or revert changes!"); +} + +void Tracker::track(std::unique_ptr &&Change) { + assert(State == TrackerState::Record && "The tracker should be tracking!"); + Changes.push_back(std::move(Change)); + +#ifndef NDEBUG + InMiddleOfCreatingChange = false; +#endif +} + +void Tracker::save() { State = TrackerState::Record; } + +void Tracker::revert() { + assert(State == TrackerState::Record && "Forgot to save()!"); + State = TrackerState::Disabled; + for (auto &Change : reverse(Changes)) + Change->revert(); + Changes.clear(); +} + +void Tracker::accept() { + assert(State == TrackerState::Record && "Forgot to save()!"); + State = TrackerState::Disabled; + for (auto &Change : Changes) + Change->accept(); + Changes.clear(); +} + +#ifndef NDEBUG +void Tracker::dump(raw_ostream &OS) const { + for (const auto &ChangePtr : Changes) { + ChangePtr->dump(OS); + OS << "\n"; + } +} +void Tracker::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG diff --git a/llvm/unittests/SandboxIR/CMakeLists.txt b/llvm/unittests/SandboxIR/CMakeLists.txt index 362653bfff965..3f43f6337b919 100644 --- a/llvm/unittests/SandboxIR/CMakeLists.txt +++ b/llvm/unittests/SandboxIR/CMakeLists.txt @@ -6,4 +6,5 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(SandboxIRTests SandboxIRTest.cpp + TrackerTest.cpp ) diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp new file mode 100644 index 0000000000000..f090dc521c32b --- /dev/null +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -0,0 +1,148 @@ +//===- TrackerTest.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; + +struct TrackerTest : public testing::Test { + LLVMContext C; + std::unique_ptr M; + + void parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + M = parseAssemblyString(IR, Err, C); + if (!M) + Err.print("TrackerTest", errs()); + } + BasicBlock *getBasicBlockByName(Function &F, StringRef Name) { + for (BasicBlock &BB : F) + if (BB.getName() == Name) + return &BB; + llvm_unreachable("Expected to find basic block!"); + } +}; + +TEST_F(TrackerTest, SetOperand) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { + %gep0 = getelementptr float, ptr %ptr, i32 0 + %gep1 = getelementptr float, ptr %ptr, i32 1 + %ld0 = load float, ptr %gep0 + store float undef, ptr %gep0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto &Tracker = Ctx.getTracker(); + Tracker.save(); + auto It = BB->begin(); + auto *Gep0 = &*It++; + auto *Gep1 = &*It++; + auto *Ld = &*It++; + auto *St = &*It++; + St->setOperand(0, Ld); + St->setOperand(1, Gep1); + Ld->setOperand(0, Gep1); + EXPECT_EQ(St->getOperand(0), Ld); + EXPECT_EQ(St->getOperand(1), Gep1); + EXPECT_EQ(Ld->getOperand(0), Gep1); + + Ctx.getTracker().revert(); + EXPECT_NE(St->getOperand(0), Ld); + EXPECT_EQ(St->getOperand(1), Gep0); + EXPECT_EQ(Ld->getOperand(0), Gep0); +} + +TEST_F(TrackerTest, RUWIf_RAUW_RUOW) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { + %ld0 = load float, ptr %ptr + %ld1 = load float, ptr %ptr + store float %ld0, ptr %ptr + store float %ld0, ptr %ptr + ret void +} +)IR"); + llvm::Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + llvm::BasicBlock *LLVMBB = &*LLVMF.begin(); + Ctx.createFunction(&LLVMF); + auto *BB = cast(Ctx.getValue(LLVMBB)); + auto It = BB->begin(); + sandboxir::Instruction *Ld0 = &*It++; + sandboxir::Instruction *Ld1 = &*It++; + sandboxir::Instruction *St0 = &*It++; + sandboxir::Instruction *St1 = &*It++; + Ctx.save(); + // Check RUWIf when the lambda returns false. + Ld0->replaceUsesWithIf(Ld1, [](const sandboxir::Use &Use) { return false; }); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld0); + + // Check RUWIf when the lambda returns true. + Ld0->replaceUsesWithIf(Ld1, [](const sandboxir::Use &Use) { return true; }); + EXPECT_EQ(St0->getOperand(0), Ld1); + EXPECT_EQ(St1->getOperand(0), Ld1); + Ctx.revert(); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld0); + + // Check RUWIf user == St0. + Ctx.save(); + Ld0->replaceUsesWithIf( + Ld1, [St0](const sandboxir::Use &Use) { return Use.getUser() == St0; }); + EXPECT_EQ(St0->getOperand(0), Ld1); + EXPECT_EQ(St1->getOperand(0), Ld0); + Ctx.revert(); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld0); + + // Check RUWIf user == St1. 
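+  // Only the use whose user is St1 should be updated; revert() must then restore it.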
+ Ctx.save(); + Ld0->replaceUsesWithIf( + Ld1, [St1](const sandboxir::Use &Use) { return Use.getUser() == St1; }); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld1); + Ctx.revert(); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld0); + + // Check RAUW. + Ctx.save(); + Ld1->replaceAllUsesWith(Ld0); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld0); + Ctx.revert(); + EXPECT_EQ(St0->getOperand(0), Ld0); + EXPECT_EQ(St1->getOperand(0), Ld0); + + // Check RUOW. + Ctx.save(); + St0->replaceUsesOfWith(Ld0, Ld1); + EXPECT_EQ(St0->getOperand(0), Ld1); + Ctx.revert(); + EXPECT_EQ(St0->getOperand(0), Ld0); + + // Check accept(). + Ctx.save(); + St0->replaceUsesOfWith(Ld0, Ld1); + EXPECT_EQ(St0->getOperand(0), Ld1); + Ctx.accept(); + EXPECT_EQ(St0->getOperand(0), Ld1); +} From 27ee33d1368b9772f75285932c00479a0fae82ee Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Thu, 18 Jul 2024 06:04:53 +0100 Subject: [PATCH 368/777] [mlir][linalg] Decompose winograd operators (#96183) Convert Linalg winograd_filter_transform, winograd_input_transform, and winograd_output_transform into nested loops with matrix multiplication with constant transform matrices. Support several configurations of Winograd Conv2D, including F(2, 3), F(4, 3) and F(2, 5). These configurations show that the implementation can support different kernel size (3 and 5) and different output size (2 and 4). Besides symetric kernel size 3x3 and 5x5, this patch also supports 1x3, 3x1, 1x5, and 5x1 kernels. The implementation is based on the paper, Fast Algorithm for Convolutional Neural Networks. (https://arxiv.org/abs/1509.09308) Reviewers: ftynse, Max191, GeorgeARM, nicolasvasilache, MaheshRavishankar, dcaballe, rengolin Reviewed By: ftynse, Max191 Pull Request: https://github.com/llvm/llvm-project/pull/96183 --- .../Dialect/Linalg/Transforms/Transforms.h | 3 + .../Linalg/Transforms/WinogradConv2D.cpp | 856 ++++++++++++++++++ .../Linalg/winograd-conv2d-rewrite.mlir | 120 +++ .../Dialect/Linalg/TestLinalgTransforms.cpp | 11 + 4 files changed, 990 insertions(+) create mode 100644 mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index eac6eb4387a0f..0c7a8edff222f 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1746,6 +1746,9 @@ void populateBlockPackMatmulPatterns(RewritePatternSet &patterns, void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m, int64_t r); +/// Patterns to decompose Winograd operators. +void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns); + /// Adds patterns that reduce the rank of named contraction ops that have /// unit dimensions in the operand(s) by converting to a sequence of `collapse_shape`, /// ``, `expand_shape` (if on tensors). 
For example a diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp index 18dd4769f9a49..754f832e98eea 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp @@ -12,10 +12,14 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/Support/MathExtras.h" namespace mlir { @@ -23,6 +27,156 @@ namespace linalg { namespace { +// clang-format off +/// Winograd Conv2D uses a minimal 2D filtering algorithm to calculate its +/// result. The formula of minimal 2D filtering algorithm F(m x m, r x r), +/// m is the output dimension and r is the filter dimension, is +/// +/// Y = A^T x [ (G x g x G^T) x (B^T x d x B) ] x A +/// +/// g is filter and d is input data. We need to prepare 6 constant +/// transformation matrices, G, G^T, B^T, B, A^T, and A for this formula. +/// +/// The following tables define these constant transformation matrices for +/// F(2 x 2, 3 x 3), F(4 x 4, 3 x 3), and F(2 x 2, 5 x 5) +constexpr float G_2x2_3x3[] = { + -1, 0, 0, + 1./2, -1./2, 1./2, + 1./2, 1./2, 1./2, + 0, 0, 1 +}; + +constexpr float GT_2x2_3x3[] = { + -1, 1./2, 1./2, 0, + 0, -1./2, 1./2, 0, + 0, 1./2, 1./2, 1 +}; + +constexpr float BT_2x2_3x3[] = { + -1, 0, 1, 0, + 0, -1, 1, 0, + 0, 1, 1, 0, + 0, -1, 0, 1 +}; + +constexpr float B_2x2_3x3[] = { + -1, 0, 0, 0, + 0, -1, 1, -1, + 1, 1, 1, 0, + 0, 0, 0, 1 +}; + +constexpr float AT_2x2_3x3[] = { + 1, 1, 1, 0, + 0, -1, 1, 1 +}; + +constexpr float A_2x2_3x3[] = { + 1, 0, + 1, -1, + 1, 1, + 0, 1 +}; + +constexpr float G_4x4_3x3[] = { + 1, 0, 0, + -1./3, 1./3, -1./3, + -1./3, -1./3, -1./3, + 1./12, -1./6, 1./3, + 1./12, 1./6, 1./3, + 0, 0, 1 +}; + +constexpr float GT_4x4_3x3[] = { + 1, -1./3, -1./3, 1./12, 1./12, 0, + 0, 1./3, -1./3, -1./6, 1./6, 0, + 0, -1./3, -1./3, 1./3, 1./3, 1 +}; + +constexpr float BT_4x4_3x3[] = { + 1./4, 0, -5./16, 0, 1./16, 0, + 0, 1./4, -1./4, -1./16, 1./16, 0, + 0, -1./4, -1./4, 1./16, 1./16, 0, + 0, 1./4, -1./8, -1./4, 1./8, 0, + 0, -1./4, -1./8, 1./4, 1./8, 0, + 0, 1./4, 0, -5./16, 0, 1./16 +}; + +constexpr float B_4x4_3x3[] = { + 1./4, 0, 0, 0, 0, 0, + 0, 1./4, -1./4, 1./4, -1./4, 1./4, + -5./16, -1./4, -1./4, -1./8, -1./8, 0, + 0, -1./16, 1./16, -1./4, 1./4, -5./16, + 1./16, 1./16, 1./16, 1./8, 1./8, 0, + 0, 0, 0, 0, 0, 1./16 +}; + +constexpr float AT_4x4_3x3[] = { + 1./8, 1./4, 1./4, 1./8, 1./8, 0, + 0, -1./4, 1./4, -1./4, 1./4, 0, + 0, 1./4, 1./4, 1./2, 1./2, 0, + 0, -1./4, 1./4, -1, 1, 1./2 +}; + +constexpr float A_4x4_3x3[] = { + 1./8, 0, 0, 0, + 1./4, -1./4, 1./4, -1./4, + 1./4, 1./4, 1./4, 1./4, + 1./8, -1./4, 1./2, -1, + 1./8, 1./4, 1./2, 1, + 0, 0, 0, 1./2 +}; + +constexpr float G_2x2_5x5[] = { + 1, 0, 0, 0, 0, + 1./6, -1./6, 1./6, -1./6, 1./6, + -1./6, -1./6, -1./6, -1./6, -1./6, +-4./15, 2./15, -1./15, 1./30, -1./60, + 1./60, 1./30, 1./15, 2./15, 4./15, + 0, 0, 0, 0, 1 +}; + +constexpr float GT_2x2_5x5[] = { + 1, 1./6, -1./6, -4./15, 1./60, 0, + 0, -1./6, -1./6, 2./15, 1./30, 0, + 0, 1./6, -1./6, -1./15, 1./15, 0, + 0, -1./6, -1./6, 1./30, 2./15, 0, + 0, 1./6, -1./6, -1./60, 
4./15, 1 +}; + +constexpr float BT_2x2_5x5[] = { + 1./8, 3./16, -1./4, -3./16, 1./8, 0, + 0, 1./8, 1./16, -5./16, 1./8, 0, + 0, -1./8, -5./16, -1./16, 1./8, 0, + 0, 1./4, -1./8, -1./4, 1./8, 0, + 0, -1./8, -1./4, 1./8, 1./4, 0, + 0, 1./8, 3./16, -1./4, -3./16, 1./8 +}; + +constexpr float B_2x2_5x5[] = { + 1./8, 0, 0, 0, 0, 0, + 3./16, 1./8, -1./8, 1./4, -1./8, 1./8, + -1./4, 1./16, -5./16, -1./8, -1./4, 3./16, + -3./16, -5./16, -1./16, -1./4, 1./8, -1./4, + 1./8, 1./8, 1./8, 1./8, 1./4, -3./16, + 0, 0, 0, 0, 0, 1./8 +}; + +constexpr float AT_2x2_5x5[] = { + 1./2, 1, 1, 2, 1, 0, + 0, -1, 1, -1, 2, 1./2 +}; + +constexpr float A_2x2_5x5[] = { + 1./2, 0, + 1, -1, + 1, 1, + 2, -1, + 1, 2, + 0, 1./2 +}; +// clang-format on + using TransformMapKeyTy = std::pair; /// We use F(m, r) to define the size of minimal filtering algorithms. @@ -36,6 +190,408 @@ constexpr TransformMapKeyTy F_2_3{2, 3}; constexpr TransformMapKeyTy F_4_3{4, 3}; constexpr TransformMapKeyTy F_2_5{2, 5}; +/// Structure to keep information of constant transform matrices. +struct TransformMatrix { + TransformMatrix(const float *table, int64_t rows, int64_t cols, + int64_t scalarFactor = 1) + : table(table), rows(rows), cols(cols), scalarFactor(scalarFactor) {} + + const float *table; + int64_t rows; + int64_t cols; + int64_t scalarFactor; +}; + +/// Utility function to convert constant array to arith.constant Value. +Value create2DTransformMatrix(OpBuilder &builder, Location loc, + TransformMatrix transform, Type type) { + ArrayRef constVec(transform.table, transform.rows * transform.cols); + + return builder.create( + loc, DenseFPElementsAttr::get( + RankedTensorType::get( + SmallVector{transform.rows, transform.cols}, type), + constVec)); +} + +/// Extract height x width data from 4D tensors. +Value extract2DDataFrom4D(OpBuilder &builder, Location loc, Value source, + Value loopNorFIndex, Value loopCorFIndex, + Value heightOffset, Value widthOffset, + int64_t extractHeight, int64_t extractWidth, + int64_t loopNorFIdx, int64_t loopCorFIdx, + int64_t heightIdx, int64_t widthIdx) { + auto sourceType = cast(source.getType()); + Type elementType = sourceType.getElementType(); + int64_t srcSize = sourceType.getRank(); + + auto oneIndex = builder.getIndexAttr(1); + SmallVector offsets; + offsets.resize(srcSize); + offsets[loopNorFIdx] = loopNorFIndex; + offsets[loopCorFIdx] = loopCorFIndex; + offsets[heightIdx] = heightOffset; + offsets[widthIdx] = widthOffset; + SmallVector sizes(srcSize, oneIndex); + sizes[heightIdx] = builder.getIndexAttr(extractHeight); + sizes[widthIdx] = builder.getIndexAttr(extractWidth); + SmallVector strides(srcSize, oneIndex); + + auto extractFilterType = + RankedTensorType::get({extractHeight, extractWidth}, elementType); + auto extractFilterOp = builder.create( + loc, extractFilterType, source, offsets, sizes, strides); + + return extractFilterOp; +} + +/// Extract height x width data from 6D tensors. 
+Value extract2DDataFrom6D(OpBuilder &builder, Location loc, Value source, + Value tileHIndex, Value tileWIndex, + Value loopNorFIndex, Value loopCorFIndex, + int64_t tileHIdx, int64_t tileWIdx, + int64_t loopNorFIdx, int64_t loopCorFIdx, + int64_t heightIdx, int64_t widthIdx) { + auto sourceType = cast(source.getType()); + Type elementType = sourceType.getElementType(); + auto sourceShape = sourceType.getShape(); + int64_t srcSize = sourceType.getRank(); + int64_t height = sourceShape[heightIdx]; + int64_t width = sourceShape[widthIdx]; + + auto zeroIndex = builder.getIndexAttr(0); + auto oneIndex = builder.getIndexAttr(1); + SmallVector offsets(srcSize, zeroIndex); + offsets.resize(srcSize); + offsets[tileHIdx] = tileHIndex; + offsets[tileWIdx] = tileWIndex; + offsets[loopNorFIdx] = loopNorFIndex; + offsets[loopCorFIdx] = loopCorFIndex; + SmallVector sizes(srcSize, oneIndex); + sizes[heightIdx] = builder.getIndexAttr(height); + sizes[widthIdx] = builder.getIndexAttr(width); + SmallVector strides(srcSize, oneIndex); + + auto extractFilterType = RankedTensorType::get({height, width}, elementType); + auto extractFilterOp = builder.create( + loc, extractFilterType, source, offsets, sizes, strides); + + return extractFilterOp; +} + +/// Insert transformed height x width data to 4D tensors which it is +/// extracted from. +Value insert2DDataTo4D(OpBuilder &builder, Location loc, Value source, + Value dest, Value loopNorFIndex, Value loopCorFIndex, + Value heightOffset, Value widthOffset, int64_t height, + int64_t width, int64_t loopNorFIdx, int64_t loopCorFIdx, + int64_t heightIdx, int64_t widthIdx) { + int64_t destSize = cast(dest.getType()).getRank(); + auto oneIndex = builder.getIndexAttr(1); + SmallVector retOffsets; + retOffsets.resize(destSize); + retOffsets[loopNorFIdx] = loopNorFIndex; + retOffsets[loopCorFIdx] = loopCorFIndex; + retOffsets[heightIdx] = heightOffset; + retOffsets[widthIdx] = widthOffset; + SmallVector retSizes(destSize, oneIndex); + retSizes[heightIdx] = builder.getIndexAttr(height); + retSizes[widthIdx] = builder.getIndexAttr(width); + SmallVector strides(destSize, oneIndex); + + auto insertSliceOp = builder.create( + loc, source, dest, retOffsets, retSizes, strides); + + return insertSliceOp; +} + +/// Insert transformed height x width data to 6D tensors which it is +/// extracted from. +Value insert2DDataTo6D(OpBuilder &builder, Location loc, Value source, + Value dest, Value tileHIndex, Value tileWIndex, + Value loopNorFIndex, Value loopCorFIndex, int64_t height, + int64_t width, int64_t tileHIdx, int64_t tileWIdx, + int64_t loopNorFIdx, int64_t loopCorFIdx, + int64_t heightIdx, int64_t widthIdx) { + int64_t destSize = cast(dest.getType()).getRank(); + auto zeroIndex = builder.getIndexAttr(0); + auto oneIndex = builder.getIndexAttr(1); + SmallVector retOffsets(destSize, zeroIndex); + retOffsets.resize(destSize); + retOffsets[tileHIdx] = tileHIndex; + retOffsets[tileWIdx] = tileWIndex; + retOffsets[loopNorFIdx] = loopNorFIndex; + retOffsets[loopCorFIdx] = loopCorFIndex; + SmallVector retSizes(destSize, oneIndex); + retSizes[heightIdx] = builder.getIndexAttr(height); + retSizes[widthIdx] = builder.getIndexAttr(width); + SmallVector strides(destSize, oneIndex); + + auto insertSliceOp = builder.create( + loc, source, dest, retOffsets, retSizes, strides); + + return insertSliceOp; +} + +/// This function transforms the filter. The data layout of the filter is FHWC. +/// The transformation matrix is 2-dimension. We need to extract H x W from +/// FHWC first. 
We need to generate 2 levels of loops to iterate on F and C. +/// After the transformation, we get +/// +/// scf.for %f = lo_f to hi_f step 1 +/// scf.for %c = lo_c to hi_c step 1 +/// %extracted = extract filter from filter +/// %ret = linalg.matmul G, %extracted +/// %ret = linalg.matmul %ret, GT +/// %inserted = insert %ret into filter +Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, + Value retValue, int64_t m, int64_t r, + bool leftTransform = true, bool rightTransform = true) { + // Map from (m, r) to G transform matrix. + static const llvm::SmallDenseMap + GMatrices = { + {F_2_3, TransformMatrix(G_2x2_3x3, 4, 3)}, + {F_4_3, TransformMatrix(G_4x4_3x3, 6, 3)}, + {F_2_5, TransformMatrix(G_2x2_5x5, 6, 5)}, + }; + + // Map from (m, r) to GT transform matrix. + static const llvm::SmallDenseMap + GTMatrices = { + {F_2_3, TransformMatrix(GT_2x2_3x3, 3, 4)}, + {F_4_3, TransformMatrix(GT_4x4_3x3, 3, 6)}, + {F_2_5, TransformMatrix(GT_2x2_5x5, 5, 6)}, + }; + + auto filterType = cast(filter.getType()); + Type elementType = filterType.getElementType(); + auto filterShape = filterType.getShape(); // F, H, W, C + int64_t filterF = filterShape[0]; + int64_t filterH = filterShape[1]; + int64_t filterW = filterShape[2]; + int64_t filterC = filterShape[3]; + + if (filterH != r && filterH != 1) + return Value(); + if (filterW != r && filterW != 1) + return Value(); + + Value zeroIdx = rewriter.create(loc, 0); + auto buildBody = [&](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange args) -> scf::ValueVector { + Value FIter = ivs[0]; + Value CIter = ivs[1]; + + // Extract (H, W) from (F, H, W, C). + auto extractFilter = + extract2DDataFrom4D(builder, loc, filter, FIter, CIter, zeroIdx, + zeroIdx, filterH, filterW, /*loopNorFIdx=*/0, + /*loopCorFIdx=*/3, /*heightIdx=*/1, /*widthIdx=*/2); + + TransformMapKeyTy key = {m, r}; + int64_t retRows = 1; + Value matmulRetValue = extractFilter; + if (leftTransform) { + // Get constant transform matrix G. + auto it = GMatrices.find(key); + if (it == GMatrices.end()) + return {}; + const TransformMatrix &GMatrix = it->second; + + retRows = GMatrix.rows; + auto matmulType = RankedTensorType::get({retRows, filterW}, elementType); + auto init = builder.create(loc, matmulType.getShape(), + elementType); + + Value G = create2DTransformMatrix(builder, loc, GMatrix, elementType); + // Multiply G x g. + auto matmulOp = builder.create( + loc, matmulType, ValueRange{G, extractFilter}, ValueRange{init}); + matmulRetValue = matmulOp.getResult(0); + } + + if (rightTransform) { + // Get constant transform matrix GT. + auto it = GTMatrices.find(key); + if (it == GTMatrices.end()) + return {}; + const TransformMatrix >Matrix = it->second; + + auto matmulType = + RankedTensorType::get({retRows, GTMatrix.cols}, elementType); + auto init = builder.create(loc, matmulType.getShape(), + elementType); + + Value GT = create2DTransformMatrix(builder, loc, GTMatrix, elementType); + // Multiply u = (G x g) x GT. + auto matmulOp = builder.create( + loc, matmulType, ValueRange{matmulRetValue, GT}, ValueRange{init}); + matmulRetValue = matmulOp.getResult(0); + } + + // Insert (H, W) to (H, W, C, F). + int64_t retHeight = leftTransform ? m + r - 1 : 1; + int64_t retWidth = rightTransform ? 
m + r - 1 : 1; + + auto insertSliceOp = + insert2DDataTo4D(builder, loc, matmulRetValue, args[0], FIter, CIter, + zeroIdx, zeroIdx, retHeight, retWidth, + /*loopNorFIdx=*/3, /*loopCorFIdx=*/2, + /*heightIdx=*/0, /*widthIdx=*/1); + + return {insertSliceOp}; + }; + + auto fUpperBound = rewriter.create(loc, filterF); + auto cUpperBound = rewriter.create(loc, filterC); + auto oneStep = rewriter.create(loc, 1); + scf::LoopNest loops = scf::buildLoopNest( + rewriter, loc, {zeroIdx, zeroIdx}, {fUpperBound, cUpperBound}, + {oneStep, oneStep}, {retValue}, buildBody); + return loops.results[0]; +} + +/// This function transforms the input. The data layout of the input is NHWC. +/// The transformation matrix is 2-dimension. We need to extract H x W from +/// NHWC first. We need to generate 2 levels of loops to iterate on N and C. +/// After the transformation, we get +/// +/// scf.for %h = 0 to tileH step 1 +/// scf.for %w = 0 to tileW step 1 +/// scf.for %n = 0 to N step 1 +/// scf.for %c = 0 to C step 1 +/// %extracted = extract %extracted from +/// %input +/// at [%n, (%h x m), (%w x m), %c] +/// %ret = linalg.matmul BT, %extracted +/// %ret = linalg.matmul %ret, B +/// %inserted = insert %ret into +/// %output +/// at [0, 0, %h, %w, %n, %c] +Value inputTransform(RewriterBase &rewriter, Location loc, Value input, + Value retValue, int64_t m, int64_t r, + bool leftTransform = true, bool rightTransform = true) { + // Map from (m, r) to BT transform matrix. + static const llvm::SmallDenseMap + BTMatrices = { + {F_2_3, TransformMatrix(BT_2x2_3x3, 4, 4)}, + {F_4_3, TransformMatrix(BT_4x4_3x3, 6, 6)}, + {F_2_5, TransformMatrix(BT_2x2_5x5, 6, 6)}, + }; + + // Map from (m, r) to B transform matrix. + static const llvm::SmallDenseMap + BMatrices = { + {F_2_3, TransformMatrix(B_2x2_3x3, 4, 4)}, + {F_4_3, TransformMatrix(B_4x4_3x3, 6, 6)}, + {F_2_5, TransformMatrix(B_2x2_5x5, 6, 6)}, + }; + + auto inputType = cast(input.getType()); + Type elementType = inputType.getElementType(); + auto inputShape = inputType.getShape(); // N, H, W, C + int64_t inputN = inputShape[0]; + int64_t inputH = inputShape[1]; + int64_t inputW = inputShape[2]; + int64_t inputC = inputShape[3]; + auto valueType = cast(retValue.getType()); + auto valueShape = valueType.getShape(); // alphaH, alphaW, HTile, WTile, N, C + int64_t tileH = valueShape[2]; + int64_t tileW = valueShape[3]; + int64_t alphaH = leftTransform ? m + r - 1 : 1; + int64_t alphaW = rightTransform ? m + r - 1 : 1; + + if ((inputH != (tileH * m) + (r - 1)) && inputH != 1) + return Value(); + if ((inputW != (tileW * m) + (r - 1)) && inputW != 1) + return Value(); + + auto buildBody = [&](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange args) -> scf::ValueVector { + Value tileHIter = ivs[0]; + Value tileWIter = ivs[1]; + Value NIter = ivs[2]; + Value CIter = ivs[3]; + + auto context = builder.getContext(); + auto affineMap = + AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); + Value heightOffset = + builder.create(loc, affineMap, tileHIter); + Value widthOffset = + builder.create(loc, affineMap, tileWIter); + + // Extract (H, W) from (N, H, W, C). + auto extractInput = + extract2DDataFrom4D(builder, loc, input, NIter, CIter, heightOffset, + widthOffset, alphaH, alphaW, /*loopNorFIdx=*/0, + /*loopCorFIdx=*/3, /*heightIdx=*/1, /*widthIdx=*/2); + + TransformMapKeyTy key = {m, r}; + int64_t retRows = 1; + int64_t retCols = 1; + Value matmulRetValue = extractInput; + if (leftTransform) { + // Get constant transform matrix BT. 
+ auto it = BTMatrices.find(key); + if (it == BTMatrices.end()) + return {}; + const TransformMatrix &BTMatrix = it->second; + + retRows = BTMatrix.rows; + auto matmulType = RankedTensorType::get({retRows, alphaW}, elementType); + auto init = builder.create(loc, matmulType.getShape(), + elementType); + + Value BT = + create2DTransformMatrix(builder, loc, BTMatrix, builder.getF32Type()); + // Multiply BT x d. + auto matmulOp = builder.create( + loc, matmulType, ValueRange{BT, matmulRetValue}, ValueRange{init}); + matmulRetValue = matmulOp.getResult(0); + } + + if (rightTransform) { + // Get constant transform matrix B. + auto it = BMatrices.find(key); + if (it == BMatrices.end()) + return {}; + const TransformMatrix &BMatrix = it->second; + + retCols = BMatrix.cols; + auto matmulType = RankedTensorType::get({retRows, retCols}, elementType); + auto init = builder.create(loc, matmulType.getShape(), + elementType); + Value B = + create2DTransformMatrix(builder, loc, BMatrix, builder.getF32Type()); + // Multiply v = (BT x d) x B. + auto matmulOp = builder.create( + loc, matmulType, ValueRange{matmulRetValue, B}, ValueRange{init}); + matmulRetValue = matmulOp.getResult(0); + } + + // Insert (H, W) to (H, W, tileH, tileW, N, C). + auto combinedVal = insert2DDataTo6D( + builder, loc, matmulRetValue, args[0], tileHIter, tileWIter, NIter, + CIter, retRows, retCols, 2, 3, /*loopNorFIdx=*/4, /*loopCorFIdx=*/5, + /*heightIdx=*/0, /*widthIdx=*/1); + + return {combinedVal}; + }; + + auto zeroIdx = rewriter.create(loc, 0); + auto tileHBound = rewriter.create(loc, tileH); + auto tileWBound = rewriter.create(loc, tileW); + auto nUpperBound = rewriter.create(loc, inputN); + auto cUpperBound = rewriter.create(loc, inputC); + auto oneStep = rewriter.create(loc, 1); + scf::LoopNest loops = scf::buildLoopNest( + rewriter, loc, {zeroIdx, zeroIdx, zeroIdx, zeroIdx}, + {tileHBound, tileWBound, nUpperBound, cUpperBound}, + {oneStep, oneStep, oneStep, oneStep}, {retValue}, buildBody); + return loops.results[0]; +} + /// This function generates linalg.batch_matmul to multiply input with filter. /// linalg.batch_matmul only supports 3-dimensional inputs. We can treat /// tileH x tileW x H x W data as the 1-dimensional data array. That is to @@ -107,6 +663,185 @@ static Value matrixMultiply(RewriterBase &rewriter, Location loc, return expandOutput; } +/// This function transforms the output. The data layout of the output is HWNF. +/// The transformation matrix is 2-dimension. We need to extract H x W from +/// HWNF first. We need to generate 2 levels of loops to iterate on N and F. +/// After the transformation, we get +/// +/// scf.for %h = 0 to tileH step 1 +/// scf.for %w = 0 to tileW step 1 +/// scf.for %n = 0 to N step 1 +/// scf.for %f = 0 to F step 1 +/// %extracted = extract %extracted from +/// %input +/// at [0, 0, %h, %w, %n, %f] +/// %ret = linalg.matmul AT, %extracted +/// %ret = linalg.matmul %ret, A +/// %inserted = insert %ret into +/// output +/// at [%n, (%h x m), (%w x m), %f] +Value outputTransform(RewriterBase &rewriter, Location loc, Value value, + Value output, int64_t m, int64_t r, + bool leftTransform = true, bool rightTransform = true) { + // Map from (m, r) to AT transform matrix. + static const llvm::SmallDenseMap + ATMatrices = { + {F_2_3, TransformMatrix(AT_2x2_3x3, 2, 4)}, + {F_4_3, TransformMatrix(AT_4x4_3x3, 4, 6, 32)}, + {F_2_5, TransformMatrix(AT_2x2_5x5, 2, 6, 16)}, + }; + + // Map from (m, r) to A transform matrix. 
+ static const llvm::SmallDenseMap + AMatrices = { + {F_2_3, TransformMatrix(A_2x2_3x3, 4, 2)}, + {F_4_3, TransformMatrix(A_4x4_3x3, 6, 4, 32)}, + {F_2_5, TransformMatrix(A_2x2_5x5, 6, 2, 16)}, + }; + + auto valueType = cast(value.getType()); + Type elementType = valueType.getElementType(); + auto valueShape = valueType.getShape(); // H, W, TileH, TileW, N, F + int64_t valueH = valueShape[0]; + int64_t valueW = valueShape[1]; + int64_t valueN = valueShape[4]; + int64_t valueF = valueShape[5]; + int64_t alphaH = leftTransform ? m + r - 1 : 1; + int64_t alphaW = rightTransform ? m + r - 1 : 1; + + if (valueH != alphaH && valueH != 1) + return Value(); + if (valueW != alphaW && valueW != 1) + return Value(); + + auto buildBody = [&](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange args) -> scf::ValueVector { + Value tileHIter = ivs[0]; + Value tileWIter = ivs[1]; + Value NIter = ivs[2]; + Value FIter = ivs[3]; + + // Extract (H, W) from (H, W, tileH, tileW, N, F). + auto extractValue = + extract2DDataFrom6D(builder, loc, value, tileHIter, tileWIter, NIter, + FIter, 2, 3, /*loopNorFIdx=*/4, + /*loopCorFIdx=*/5, /*heightIdx=*/0, /*widthIdx=*/1); + + TransformMapKeyTy key = {m, r}; + int64_t retRows = 1; + int64_t retCols = 1; + int64_t leftScalarFactor = 1; + int64_t rightScalarFactor = 1; + Value matmulRetValue = extractValue; + if (leftTransform) { + // Get constant transform matrix AT. + auto it = ATMatrices.find(key); + if (it == ATMatrices.end()) + return {}; + const TransformMatrix &ATMatrix = it->second; + + leftScalarFactor = ATMatrix.scalarFactor; + retRows = ATMatrix.rows; + auto matmulType = RankedTensorType::get({retRows, valueW}, elementType); + auto init = builder.create(loc, matmulType.getShape(), + elementType); + + Value AT = create2DTransformMatrix(builder, loc, ATMatrix, elementType); + // Multiply AT x m. + auto matmulOp = builder.create( + loc, matmulType, ValueRange{AT, matmulRetValue}, ValueRange{init}); + matmulRetValue = matmulOp.getResult(0); + } + + if (rightTransform) { + // Get constant transform matrix T. + auto it = AMatrices.find(key); + if (it == AMatrices.end()) + return {}; + const TransformMatrix &AMatrix = it->second; + + rightScalarFactor = AMatrix.scalarFactor; + auto matmulType = + RankedTensorType::get({retRows, AMatrix.cols}, elementType); + retCols = AMatrix.cols; + auto init = builder.create(loc, matmulType.getShape(), + elementType); + + Value A = create2DTransformMatrix(builder, loc, AMatrix, elementType); + // Multiply y = (AT x m) x A. + auto matmulOp = builder.create( + loc, matmulType, ValueRange{matmulRetValue, A}, ValueRange{init}); + matmulRetValue = matmulOp.getResult(0); + } + + if (leftScalarFactor * rightScalarFactor != 1) { + // Multiply scalar factor. 
+ Value scalarFactor = builder.create( + loc, + FloatAttr::get(elementType, leftScalarFactor * rightScalarFactor)); + auto matmulType = RankedTensorType::get({retRows, retCols}, elementType); + auto init = builder.create(loc, matmulType.getShape(), + elementType); + + auto identityAffineMap = rewriter.getMultiDimIdentityMap(2); + SmallVector affineMaps = { + AffineMap::get(2, 0, init.getContext()), identityAffineMap}; + auto broadcastedScalar = + rewriter + .create( + loc, matmulType, ValueRange{scalarFactor}, ValueRange{init}, + affineMaps, + llvm::ArrayRef{ + utils::IteratorType::parallel, + utils::IteratorType::parallel}, + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + nestedBuilder.create(nestedLoc, args[0]); + }) + .getResult(0); + + matmulRetValue = builder + .create( + loc, matmulType, + ValueRange{broadcastedScalar, matmulRetValue}, + ValueRange{init}) + .getResult(0); + } + + auto context = builder.getContext(); + auto affineMap = + AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); + Value heightOffset = + builder.create(loc, affineMap, tileHIter); + Value widthOffset = + builder.create(loc, affineMap, tileWIter); + + // Insert (H, W) to (N, H, W, F). + Value combinedVal = + insert2DDataTo4D(builder, loc, matmulRetValue, args[0], NIter, FIter, + heightOffset, widthOffset, retRows, retCols, + /*loopNorFIdx=*/0, + /*loopCorFIdx=*/3, /*heightIdx=*/1, + /*widthIdx=*/2); + + return {combinedVal}; + }; + + int64_t tilwH = valueShape[2]; + int64_t tileW = valueShape[3]; + auto zeroIdx = rewriter.create(loc, 0); + auto tileHBound = rewriter.create(loc, tilwH); + auto tileWBound = rewriter.create(loc, tileW); + auto nUpperBound = rewriter.create(loc, valueN); + auto fUpperBound = rewriter.create(loc, valueF); + auto oneStep = rewriter.create(loc, 1); + scf::LoopNest loops = scf::buildLoopNest( + rewriter, loc, {zeroIdx, zeroIdx, zeroIdx, zeroIdx}, + {tileHBound, tileWBound, nUpperBound, fUpperBound}, + {oneStep, oneStep, oneStep, oneStep}, {output}, buildBody); + return loops.results[0]; +} + /// Create an empty tensor with alignedType and insert the value into the /// created empty tensor with aligned size. static Value padToAlignedTensor(RewriterBase &rewriter, Location loc, @@ -292,6 +1027,120 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, return transformedOutput.getDefiningOp(); } +/// A helper function to decompose linalg.winograd_filter_transform. +FailureOr +decomposeWinogradFilterTransformHelper(RewriterBase &rewriter, + linalg::WinogradFilterTransformOp op) { + Location loc = op.getLoc(); + Value filter = op.getFilter(); + auto filterType = cast(filter.getType()); + auto filterShape = filterType.getShape(); + int64_t filterH = filterShape[1]; + int64_t filterW = filterShape[2]; + + // For F(m x 1, r x 1), we only need to do left side transform. + bool leftTransform = filterH != 1; + // For F(1 x m, 1 x r), we only need to do right side transform. + bool rightTransform = filterW != 1; + Value transformedFilter = + filterTransform(rewriter, loc, filter, op.getOutput(), op.getM(), + op.getR(), leftTransform, rightTransform); + if (!transformedFilter) + return failure(); + + rewriter.replaceOp(op, transformedFilter); + + return transformedFilter.getDefiningOp(); +} + +/// A helper function to decompose linalg.winograd_input_transform. 
+FailureOr +decomposeWinogradInputTransformHelper(RewriterBase &rewriter, + linalg::WinogradInputTransformOp op) { + Location loc = op.getLoc(); + Value input = op.getInput(); + auto inputType = cast(input.getType()); + auto inputShape = inputType.getShape(); + int64_t inputH = inputShape[1]; + int64_t inputW = inputShape[2]; + + // For F(m x 1, r x 1), we only need to do left side transform. + bool leftTransform = inputH != 1; + // For F(1 x m, 1 x r), we only need to do right side transform. + bool rightTransform = inputW != 1; + Value transformedInput = + inputTransform(rewriter, loc, op.getInput(), op.getOutput(), op.getM(), + op.getR(), leftTransform, rightTransform); + if (!transformedInput) + return failure(); + + rewriter.replaceOp(op, transformedInput); + + return transformedInput.getDefiningOp(); +} + +/// A helper function to decompose linalg.winograd_output_transform. +FailureOr +decomposeWinogradOutputTransformHelper(RewriterBase &rewriter, + linalg::WinogradOutputTransformOp op) { + Location loc = op.getLoc(); + Value value = op.getValue(); + auto valueType = cast(value.getType()); + auto valueShape = valueType.getShape(); + int64_t valueH = valueShape[0]; + int64_t valueW = valueShape[1]; + + // For F(m x 1, r x 1), we only need to do left side transform. + bool leftTransform = valueH != 1; + // For F(1 x m, 1 x r), we only need to do right side transform. + bool rightTransform = valueW != 1; + Value transformedOutput = + outputTransform(rewriter, loc, value, op.getOutput(), op.getM(), + op.getR(), leftTransform, rightTransform); + if (!transformedOutput) + return failure(); + + rewriter.replaceOp(op, transformedOutput); + + return transformedOutput.getDefiningOp(); +} + +/// A rewrite pattern to decompose linalg.winograd_filter_transform operations. +class DecomposeWinogradFilterTransform final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(linalg::WinogradFilterTransformOp op, + PatternRewriter &rewriter) const override { + return decomposeWinogradFilterTransformHelper(rewriter, op); + } +}; + +/// A rewrite pattern to decompose linalg.winograd_input_transform operations. +class DecomposeWinogradInputTransform final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(linalg::WinogradInputTransformOp op, + PatternRewriter &rewriter) const override { + return decomposeWinogradInputTransformHelper(rewriter, op); + } +}; + +/// A rewrite pattern to decompose linalg.winograd_output_transform operations. +class DecomposeWinogradOutputTransform final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(linalg::WinogradOutputTransformOp op, + PatternRewriter &rewriter) const override { + return decomposeWinogradOutputTransformHelper(rewriter, op); + } +}; + /// A rewrite pattern for Winograd Conv2D algorithm. 
class WinogradConv2DNhwcFhwc final : public OpRewritePattern { @@ -328,5 +1177,12 @@ void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m, patterns.insert(context, m, r); } +void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns) { + MLIRContext *context = patterns.getContext(); + patterns + .insert(context); +} + } // end namespace linalg } // end namespace mlir diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir new file mode 100644 index 0000000000000..095a6636b68dc --- /dev/null +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -0,0 +1,120 @@ +// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-decompose-winograd-ops | FileCheck %s + +func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %2 = tensor.empty() : tensor<6x6x5x2xf32> + %3 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%2 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %padded = tensor.pad %arg0 low[0, 0, 0, 0] high[0, 3, 3, 0] { + ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): + tensor.yield %cst : f32 + } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> + %4 = tensor.empty() : tensor<6x6x3x3x2x5xf32> + %5 = linalg.winograd_input_transform m(4) r(3) ins(%padded : tensor<2x14x14x5xf32>) outs(%4 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> + %collapsed = tensor.collapse_shape %3 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> + %collapsed_0 = tensor.collapse_shape %5 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> + %6 = tensor.empty() : tensor<36x18x2xf32> + %7 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%6 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %expanded = tensor.expand_shape %7 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> + %padded_1 = tensor.pad %arg2 low[0, 0, 0, 0] high[0, 3, 3, 0] { + ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): + tensor.yield %cst : f32 + } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> + %8 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %extracted_slice = tensor.extract_slice %8[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> + return %extracted_slice : tensor<2x9x9x2xf32> +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> ()> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func.func @conv2d +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { +// CHECK-DAG: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<{{\[}}[1.250000e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00], [2.500000e-01, -2.500000e-01, 2.500000e-01, -2.500000e-01], [2.500000e-01, 2.500000e-01, 2.500000e-01, 2.500000e-01], [1.250000e-01, -2.500000e-01, 5.000000e-01, -1.000000e+00], [1.250000e-01, 2.500000e-01, 5.000000e-01, 1.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 5.000000e-01]]> : tensor<6x4xf32> +// CHECK-DAG: %[[CST_1:.*]] = arith.constant 
dense<{{\[}}[1.250000e-01, 2.500000e-01, 2.500000e-01, 1.250000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, 2.500000e-01, -2.500000e-01, 2.500000e-01, 0.000000e+00], [0.000000e+00, 2.500000e-01, 2.500000e-01, 5.000000e-01, 5.000000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, 2.500000e-01, -1.000000e+00, 1.000000e+00, 5.000000e-01]]> : tensor<4x6xf32> +// CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<{{\[}}[2.500000e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00], [0.000000e+00, 2.500000e-01, -2.500000e-01, 2.500000e-01, -2.500000e-01, 2.500000e-01], [-3.125000e-01, -2.500000e-01, -2.500000e-01, -1.250000e-01, -1.250000e-01, 0.000000e+00], [0.000000e+00, -6.250000e-02, 6.250000e-02, -2.500000e-01, 2.500000e-01, -3.125000e-01], [6.250000e-02, 6.250000e-02, 6.250000e-02, 1.250000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 6.250000e-02]]> : tensor<6x6xf32> +// CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<{{\[}}[2.500000e-01, 0.000000e+00, -3.125000e-01, 0.000000e+00, 6.250000e-02, 0.000000e+00], [0.000000e+00, 2.500000e-01, -2.500000e-01, -6.250000e-02, 6.250000e-02, 0.000000e+00], [0.000000e+00, -2.500000e-01, -2.500000e-01, 6.250000e-02, 6.250000e-02, 0.000000e+00], [0.000000e+00, 2.500000e-01, -1.250000e-01, -2.500000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, -1.250000e-01, 2.500000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, 2.500000e-01, 0.000000e+00, -3.125000e-01, 0.000000e+00, 6.250000e-02]]> : tensor<6x6xf32> +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[CST_4:.*]] = arith.constant dense<{{\[}}[1.000000e+00, -0.333333343, -0.333333343, 0.0833333358, 0.0833333358, 0.000000e+00], [0.000000e+00, 0.333333343, -0.333333343, -0.166666672, 0.166666672, 0.000000e+00], [0.000000e+00, -0.333333343, -0.333333343, 0.333333343, 0.333333343, 1.000000e+00]]> : tensor<3x6xf32> +// CHECK-DAG: %[[CST_5:.*]] = arith.constant dense<{{\[}}[1.000000e+00, 0.000000e+00, 0.000000e+00], [-0.333333343, 0.333333343, -0.333333343], [-0.333333343, -0.333333343, -0.333333343], [0.0833333358, -0.166666672, 0.333333343], [0.0833333358, 0.166666672, 0.333333343], [0.000000e+00, 0.000000e+00, 1.000000e+00]]> : tensor<6x3xf32> +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], %[[C0]], %[[C0]], %[[ARG5]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<3x3xf32> +// CHECK-NEXT: %[[S8:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S8]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[S9]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S10]] : tensor<6x6xf32>) -> 
tensor<6x6xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG6]][%[[C0]], %[[C0]], %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x5x2xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S7]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] { +// CHECK-NEXT: ^bb0(%[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index): +// CHECK-NEXT: tensor.yield %[[CST_6]] : f32 +// CHECK-NEXT: } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> +// CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S2]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK-NEXT: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][%[[ARG7]], %[[S10]], %[[S11]], %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<6x6xf32> +// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_9]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S15:.*]] = linalg.matmul ins(%[[S13]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG10]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S9]] : tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S8]] : tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S7]] : tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> +// CHECK-NEXT: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> +// CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x18x2xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_7]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK-NEXT: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] { +// CHECK-NEXT: ^bb0(%[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: 
index, %[[ARG6:.*]]: index): +// CHECK-NEXT: tensor.yield %[[CST_6]] : f32 +// CHECK-NEXT: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[PADDED_8]]) -> (tensor<2x12x12x2xf32>) { +// CHECK-NEXT: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x12x12x2xf32>) { +// CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { +// CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { +// CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6xf32> +// CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S10]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[S11]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S12]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK-NEXT: %[[S15:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S14]] : tensor<4x4xf32>) { +// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[IN]] : f32 +// CHECK-NEXT: } -> tensor<4x4xf32> +// CHECK-NEXT: %[[S16:.*]] = linalg.mul ins(%[[S15]], %[[S13]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S17:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S18:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S16]] into %[[ARG10]][%[[ARG7]], %[[S17]], %[[S18]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S9]] : tensor<2x12x12x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S8]] : tensor<2x12x12x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S7]] : tensor<2x12x12x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S6]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> +// CHECK-NEXT: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> +// CHECK-NEXT: } diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp index 12cb46a5968f1..5899f56da7345 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -127,6 +127,9 @@ struct TestLinalgTransforms *this, "test-winograd-conv2d", llvm::cl::desc("Test transform conv2d by Winograd conv2d algorithm"), llvm::cl::init(false)}; + Option testDecomposeWinogradOps{ + *this, "test-decompose-winograd-ops", + llvm::cl::desc("Test decompose Winograd ops"), llvm::cl::init(false)}; }; } // namespace @@ -218,6 +221,12 @@ static void applyWinogradConv2D(func::FuncOp funcOp) { 
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); } +static void applyDecomposeWinogradOps(func::FuncOp funcOp) { + RewritePatternSet patterns(funcOp.getContext()); + populateDecomposeWinogradOpsPatterns(patterns); + (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); +} + /// Apply transformations specified as patterns. void TestLinalgTransforms::runOnOperation() { if (testPatterns) @@ -244,6 +253,8 @@ void TestLinalgTransforms::runOnOperation() { return applyEraseUnnecessaryInputs(getOperation()); if (testWinogradConv2D) return applyWinogradConv2D(getOperation()); + if (testDecomposeWinogradOps) + return applyDecomposeWinogradOps(getOperation()); } namespace mlir { From 4782a4ab0ad43b2f47f20afbe025b841d7f0ac04 Mon Sep 17 00:00:00 2001 From: Joachim Date: Thu, 18 Jul 2024 07:41:41 +0200 Subject: [PATCH 369/777] [OpenMP] Fix calculation of dependencies for multi-dimensional iteration space (#99347) The expectation for multiple iterators used in a single depend clause (`depend(iterator(i=0:5,j=0:5), in:x[i][j])`) is that the iterator space is the product of the iteration vectors (25 in that case). The current codeGen only works correctly, if `numIterators() = 1`. For more iterators, the execution results in runtime assertions or segfaults. The modified codeGen first calculates the iteration space, then multiplies to the number of dependencies in the depend clause and finally adds to the total number of iterator dependencies. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 12 ++++++---- clang/test/OpenMP/depend_iterator_bug.c | 29 +++++++++++++++++++++++++ clang/test/OpenMP/task_codegen.c | 3 ++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 652fb700fc6af..a6a87ec88ee8a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4259,14 +4259,18 @@ std::pair CGOpenMPRuntime::emitDependClause( // Include number of iterations, if any. 
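  // Illustrative worked example (the clause is taken from the new test added
  // below; the arithmetic is the point): for
  //   #pragma omp task depend(iterator(i=0:5, j=0:5), in: z[i][j])
  // the loop over the iterators accumulates
  //   ClauseIteratorSpace = 5 * 5 = 25,
  // which is then multiplied by the single dependency in the clause, so 25
  // kmp_depend_info entries are reserved instead of the 5 + 5 = 10 produced
  // by the previous per-iterator summation.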
if (const auto *IE = cast_or_null(D.IteratorExpr)) { + llvm::Value *ClauseIteratorSpace = + llvm::ConstantInt::get(CGF.IntPtrTy, 1); for (unsigned I = 0, E = IE->numOfIterators(); I < E; ++I) { llvm::Value *Sz = CGF.EmitScalarExpr(IE->getHelper(I).Upper); Sz = CGF.Builder.CreateIntCast(Sz, CGF.IntPtrTy, /*isSigned=*/false); - llvm::Value *NumClauseDeps = CGF.Builder.CreateNUWMul( - Sz, llvm::ConstantInt::get(CGF.IntPtrTy, D.DepExprs.size())); - NumOfRegularWithIterators = - CGF.Builder.CreateNUWAdd(NumOfRegularWithIterators, NumClauseDeps); + ClauseIteratorSpace = CGF.Builder.CreateNUWMul(Sz, ClauseIteratorSpace); } + llvm::Value *NumClauseDeps = CGF.Builder.CreateNUWMul( + ClauseIteratorSpace, + llvm::ConstantInt::get(CGF.IntPtrTy, D.DepExprs.size())); + NumOfRegularWithIterators = + CGF.Builder.CreateNUWAdd(NumOfRegularWithIterators, NumClauseDeps); HasRegularWithIterators = true; continue; } diff --git a/clang/test/OpenMP/depend_iterator_bug.c b/clang/test/OpenMP/depend_iterator_bug.c index b4aaaac08374f..ff11d8a5d0e40 100644 --- a/clang/test/OpenMP/depend_iterator_bug.c +++ b/clang/test/OpenMP/depend_iterator_bug.c @@ -5,6 +5,7 @@ int x[100]; int y[100]; +int z[100][100]; // CHECK-LABEL: @many_iterators_single_clause( // CHECK: [[VLA:%.*]] = alloca [[STRUCT_KMP_DEPEND_INFO:%.*]], i64 10, align 16 @@ -24,3 +25,31 @@ void many_iterators_many_clauses(void) { { } } + +// CHECK-LABEL: @multidim_iterators_clause1( +// CHECK: [[VLA:%.*]] = alloca [[STRUCT_KMP_DEPEND_INFO:%.*]], i64 1, align 16 +// CHECK: = call i32 @__kmpc_omp_task_with_deps(ptr {{.*}}, i32 {{.*}}, ptr {{.*}}, i32 1, ptr {{.*}}, i32 0, ptr null) +void multidim_iterators_clause1(void) { + #pragma omp task depend(iterator(i=0:1, j=0:1), in: z[i][j]) + { + } +} + +// CHECK-LABEL: @multidim_iterators_offset_clause( +// CHECK: [[VLA:%.*]] = alloca [[STRUCT_KMP_DEPEND_INFO:%.*]], i64 1, align 16 +// CHECK: = call i32 @__kmpc_omp_task_with_deps(ptr {{.*}}, i32 {{.*}}, ptr {{.*}}, i32 1, ptr {{.*}}, i32 0, ptr null) +void multidim_iterators_offset_clause(void) { + #pragma omp task depend(iterator(i=5:6, j=10:11), in: z[i][j]) + { + } +} + +// CHECK-LABEL: @multidim_iterators_clause25( +// CHECK: [[VLA:%.*]] = alloca [[STRUCT_KMP_DEPEND_INFO:%.*]], i64 25, align 16 +// CHECK: = call i32 @__kmpc_omp_task_with_deps(ptr {{.*}}, i32 {{.*}}, ptr {{.*}}, i32 25, ptr {{.*}}, i32 0, ptr null) +void multidim_iterators_clause25(void) { + #pragma omp task depend(iterator(i=0:5, j=0:5), in: z[i][j]) + { + } +} + diff --git a/clang/test/OpenMP/task_codegen.c b/clang/test/OpenMP/task_codegen.c index be404827ce901..0ca815c0eccf1 100644 --- a/clang/test/OpenMP/task_codegen.c +++ b/clang/test/OpenMP/task_codegen.c @@ -139,7 +139,8 @@ for (int i = 0; i < 10; ++i) // CHECK: [[EB_SUB_2_ADD_1_SUB:%.+]] = sub i32 [[EB_SUB_2_ADD]], 1 // CHECK: [[EB_SUB_2_ADD_1_SUB_2_DIV:%.+]] = udiv i32 [[EB_SUB_2_ADD_1_SUB]], 2 // CHECK: [[ELEMS:%.+]] = zext i32 [[EB_SUB_2_ADD_1_SUB_2_DIV]] to i64 - // CHECK: [[NELEMS:%.+]] = mul nuw i64 [[ELEMS]], 1 + // CHECK: [[ELEMS2:%.+]] = mul nuw i64 [[ELEMS]], 1 + // CHECK: [[NELEMS:%.+]] = mul nuw i64 [[ELEMS2]], 1 // ITERATOR_TOTAL = NELEMS + 0; // CHECK: [[ITERATOR_TOTAL:%.+]] = add nuw i64 0, [[NELEMS]] From 810adbaa0236eed10ecfd9f96837b6d23d308a7d Mon Sep 17 00:00:00 2001 From: Tristan Ross Date: Wed, 17 Jul 2024 22:57:26 -0700 Subject: [PATCH 370/777] [RISCV] Remove unused include in RISCVMCTargetDesc.h (#98790) Goes in hand with #97130, split out to figure out CI fails. 
Should just build whatever subprojects utilize the `RISCVMCTargetDesc.h` header and it should build & test just like normal. Co-authored-by: pca006132 --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h index d4aa0fe99078e..6cc22af601fdb 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h @@ -13,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H #define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H -#include "llvm/Config/config.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/DataTypes.h" #include From fbf8b82cd02818c0888805bb39abbf550333bea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 17 Jul 2024 13:51:56 +0200 Subject: [PATCH 371/777] [clang][Interp][NFC] Be more cautious about Block initialization state ... when moving a Block to a DeadBlock. Only invoke the MoveFn if the old block was initialized at all. --- clang/lib/AST/Interp/InterpBlock.cpp | 3 +++ clang/lib/AST/Interp/InterpState.cpp | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/InterpBlock.cpp b/clang/lib/AST/Interp/InterpBlock.cpp index 7bef5e678c074..5ac778aeb6075 100644 --- a/clang/lib/AST/Interp/InterpBlock.cpp +++ b/clang/lib/AST/Interp/InterpBlock.cpp @@ -110,6 +110,9 @@ DeadBlock::DeadBlock(DeadBlock *&Root, Block *Blk) } void DeadBlock::free() { + if (B.IsInitialized) + B.invokeDtor(); + if (Prev) Prev->Next = Next; if (Next) diff --git a/clang/lib/AST/Interp/InterpState.cpp b/clang/lib/AST/Interp/InterpState.cpp index 332f551838b72..4ea05305540ee 100644 --- a/clang/lib/AST/Interp/InterpState.cpp +++ b/clang/lib/AST/Interp/InterpState.cpp @@ -69,13 +69,15 @@ void InterpState::deallocate(Block *B) { char *Memory = reinterpret_cast(std::malloc(sizeof(DeadBlock) + Size)); auto *D = new (Memory) DeadBlock(DeadBlocks, B); + std::memset(D->B.rawData(), 0, D->B.getSize()); // Move data and metadata from the old block to the new (dead)block. - if (Desc->MoveFn) { + if (B->IsInitialized && Desc->MoveFn) { Desc->MoveFn(B, B->data(), D->data(), Desc); if (Desc->getMetadataSize() > 0) std::memcpy(D->rawData(), B->rawData(), Desc->getMetadataSize()); } + D->B.IsInitialized = B->IsInitialized; // We moved the contents over to the DeadBlock. B->IsInitialized = false; From 0e986e395f9cd759b859ba0c934c0d73de4554c8 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 18 Jul 2024 06:02:49 +0000 Subject: [PATCH 372/777] [MLGO] Fix MLGO executable scripts The MLGO executable scripts were previously set up incorrectly with the entrypoints. This patch corrects the entrypoints so that the scripts work as expected rather than throwing import errors in the wrapper. 
--- llvm/utils/mlgo-utils/pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml index c4139e52b4246..b72465c2417c2 100644 --- a/llvm/utils/mlgo-utils/pyproject.toml +++ b/llvm/utils/mlgo-utils/pyproject.toml @@ -17,6 +17,6 @@ classifiers = [ version = {attr = "mlgo.__version__"} [project.scripts] -combine_training_corpus = "mlgo.combine_training_corpus:entrypoint" -extract_ir = "mlgo.extract_ir:entrypoint" -make_corpus = "mlgo.make_corpus:entrypoint" +combine_training_corpus = "mlgo.corpus.combine_training_corpus:parse_args_and_run" +extract_ir = "mlgo.corpus.extract_ir:parse_args_and_run" +make_corpus = "mlgo.corpus.make_corpus:parse_args_and_run" From 1e6672af2497042d5dad0236c2ad9e61f879ac07 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sat, 13 Jul 2024 22:13:23 +0200 Subject: [PATCH 373/777] [Flang][Runtime] Simplify StringLength implementation This implementation relies on arithmetic conversion, let's see what happens when we do std::size_t length{std::strlen(string)}; if (length <= std::numeric_limits::max()) return static_cast(length); 1) if size_t == uint32_t (or lower), then the comparison operator invokes integral promotion to uint64_t, the comparison happens, it's fine. 2) if size_t == uint64_t, then the comparison is done between unsigned types, which implies a conversion of std::numeric_limits::max() to uint64_t, which happens without accuracy loss, fine 3) if size_t == uint128_t (or higher), then we invoke integral promotion of std::int64_t, it's also fine. So this snippet has the same behavior as the existing one, while being easier to read. --- flang/runtime/command.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/flang/runtime/command.cpp b/flang/runtime/command.cpp index e642248a25e68..a555e26f96a66 100644 --- a/flang/runtime/command.cpp +++ b/flang/runtime/command.cpp @@ -47,13 +47,9 @@ pid_t RTNAME(GetPID)() { return getpid(); } // Returns the length of the \p string. Assumes \p string is valid. static std::int64_t StringLength(const char *string) { std::size_t length{std::strlen(string)}; - if constexpr (sizeof(std::size_t) < sizeof(std::int64_t)) { + if (length <= std::numeric_limits::max()) return static_cast(length); - } else { - std::size_t max{std::numeric_limits::max()}; - return length > max ? 0 // Just fail. - : static_cast(length); - } + return 0; } static void FillWithSpaces(const Descriptor &value, std::size_t offset = 0) { From f36331770267501e157ac34afc3ca7d7a0bfb52c Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Wed, 17 Jul 2024 23:33:52 -0700 Subject: [PATCH 374/777] [APFloat] Add support for f8E4M3 IEEE 754 type (#97179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds `f8E4M3` type to APFloat. 
`f8E4M3` type follows IEEE 754 convention ```c f8E4M3 (IEEE 754) - Exponent bias: 7 - Maximum stored exponent value: 14 (binary 1110) - Maximum unbiased exponent value: 14 - 7 = 7 - Minimum stored exponent value: 1 (binary 0001) - Minimum unbiased exponent value: 1 − 7 = −6 - Precision specifies the total number of bits used for the significand (mantisa), including implicit leading integer bit = 3 + 1 = 4 - Follows IEEE 754 conventions for representation of special values - Has Positive and Negative zero - Has Positive and Negative infinity - Has NaNs Additional details: - Max exp (unbiased): 7 - Min exp (unbiased): -6 - Infinities (+/-): S.1111.000 - Zeros (+/-): S.0000.000 - NaNs: S.1111.{001, 010, 011, 100, 101, 110, 111} - Max normal number: S.1110.111 = +/-2^(7) x (1 + 0.875) = +/-240 - Min normal number: S.0001.000 = +/-2^(-6) - Max subnormal number: S.0000.111 = +/-2^(-6) x 0.875 = +/-2^(-9) x 7 - Min subnormal number: S.0000.001 = +/-2^(-6) x 0.125 = +/-2^(-9) ``` Related PRs: - [PR-97118](https://github.com/llvm/llvm-project/pull/97118) Add f8E4M3 IEEE 754 type to mlir --- clang/include/clang/AST/Stmt.h | 6 +-- clang/lib/AST/MicrosoftMangle.cpp | 1 + llvm/include/llvm/ADT/APFloat.h | 6 +++ llvm/lib/Support/APFloat.cpp | 20 +++++++++ llvm/unittests/ADT/APFloatTest.cpp | 66 ++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index e91e89d728ca0..bbd7634bcc3bf 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -460,10 +460,10 @@ class alignas(void *) Stmt { unsigned : NumExprBits; static_assert( - llvm::APFloat::S_MaxSemantics < 16, - "Too many Semantics enum values to fit in bitfield of size 4"); + llvm::APFloat::S_MaxSemantics < 32, + "Too many Semantics enum values to fit in bitfield of size 5"); LLVM_PREFERRED_TYPE(llvm::APFloat::Semantics) - unsigned Semantics : 4; // Provides semantics for APFloat construction + unsigned Semantics : 5; // Provides semantics for APFloat construction LLVM_PREFERRED_TYPE(bool) unsigned IsExact : 1; }; diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index fac14ce1dce8c..4016043df62ed 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -981,6 +981,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) { case APFloat::S_IEEEquad: Out << 'Y'; break; case APFloat::S_PPCDoubleDouble: Out << 'Z'; break; case APFloat::S_Float8E5M2: + case APFloat::S_Float8E4M3: case APFloat::S_Float8E4M3FN: case APFloat::S_Float8E5M2FNUZ: case APFloat::S_Float8E4M3FNUZ: diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index db2fa480655c6..bff8e6490d1de 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -166,6 +166,9 @@ struct APFloatBase { // This format's exponent bias is 16, instead of the 15 (2 ** (5 - 1) - 1) // that IEEE precedent would imply. S_Float8E5M2FNUZ, + // 8-bit floating point number following IEEE-754 conventions with bit + // layout S1E4M3. + S_Float8E4M3, // 8-bit floating point number mostly following IEEE-754 conventions with // bit layout S1E4M3 as described in https://arxiv.org/abs/2209.05433. 
// Unlike IEEE-754 types, there are no infinity values, and NaN is @@ -217,6 +220,7 @@ struct APFloatBase { static const fltSemantics &PPCDoubleDouble() LLVM_READNONE; static const fltSemantics &Float8E5M2() LLVM_READNONE; static const fltSemantics &Float8E5M2FNUZ() LLVM_READNONE; + static const fltSemantics &Float8E4M3() LLVM_READNONE; static const fltSemantics &Float8E4M3FN() LLVM_READNONE; static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE; static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE; @@ -638,6 +642,7 @@ class IEEEFloat final : public APFloatBase { APInt convertPPCDoubleDoubleAPFloatToAPInt() const; APInt convertFloat8E5M2APFloatToAPInt() const; APInt convertFloat8E5M2FNUZAPFloatToAPInt() const; + APInt convertFloat8E4M3APFloatToAPInt() const; APInt convertFloat8E4M3FNAPFloatToAPInt() const; APInt convertFloat8E4M3FNUZAPFloatToAPInt() const; APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const; @@ -656,6 +661,7 @@ class IEEEFloat final : public APFloatBase { void initFromPPCDoubleDoubleAPInt(const APInt &api); void initFromFloat8E5M2APInt(const APInt &api); void initFromFloat8E5M2FNUZAPInt(const APInt &api); + void initFromFloat8E4M3APInt(const APInt &api); void initFromFloat8E4M3FNAPInt(const APInt &api); void initFromFloat8E4M3FNUZAPInt(const APInt &api); void initFromFloat8E4M3B11FNUZAPInt(const APInt &api); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 3664de71d06df..26b4f8e55448f 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -136,6 +136,7 @@ static constexpr fltSemantics semIEEEquad = {16383, -16382, 113, 128}; static constexpr fltSemantics semFloat8E5M2 = {15, -14, 3, 8}; static constexpr fltSemantics semFloat8E5M2FNUZ = { 15, -15, 3, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; +static constexpr fltSemantics semFloat8E4M3 = {7, -6, 4, 8}; static constexpr fltSemantics semFloat8E4M3FN = { 8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes}; static constexpr fltSemantics semFloat8E4M3FNUZ = { @@ -208,6 +209,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) { return Float8E5M2(); case S_Float8E5M2FNUZ: return Float8E5M2FNUZ(); + case S_Float8E4M3: + return Float8E4M3(); case S_Float8E4M3FN: return Float8E4M3FN(); case S_Float8E4M3FNUZ: @@ -246,6 +249,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) { return S_Float8E5M2; else if (&Sem == &llvm::APFloat::Float8E5M2FNUZ()) return S_Float8E5M2FNUZ; + else if (&Sem == &llvm::APFloat::Float8E4M3()) + return S_Float8E4M3; else if (&Sem == &llvm::APFloat::Float8E4M3FN()) return S_Float8E4M3FN; else if (&Sem == &llvm::APFloat::Float8E4M3FNUZ()) @@ -276,6 +281,7 @@ const fltSemantics &APFloatBase::PPCDoubleDouble() { } const fltSemantics &APFloatBase::Float8E5M2() { return semFloat8E5M2; } const fltSemantics &APFloatBase::Float8E5M2FNUZ() { return semFloat8E5M2FNUZ; } +const fltSemantics &APFloatBase::Float8E4M3() { return semFloat8E4M3; } const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; } const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; } const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() { @@ -3617,6 +3623,11 @@ APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const { return convertIEEEFloatToAPInt(); } +APInt IEEEFloat::convertFloat8E4M3APFloatToAPInt() const { + assert(partCount() == 1); + return convertIEEEFloatToAPInt(); +} + APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const { assert(partCount() == 1); 
return convertIEEEFloatToAPInt(); @@ -3681,6 +3692,9 @@ APInt IEEEFloat::bitcastToAPInt() const { if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ) return convertFloat8E5M2FNUZAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3) + return convertFloat8E4M3APFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN) return convertFloat8E4M3FNAPFloatToAPInt(); @@ -3902,6 +3916,10 @@ void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) { initFromIEEEAPInt(api); } +void IEEEFloat::initFromFloat8E4M3APInt(const APInt &api) { + initFromIEEEAPInt(api); +} + void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) { initFromIEEEAPInt(api); } @@ -3951,6 +3969,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { return initFromFloat8E5M2APInt(api); if (Sem == &semFloat8E5M2FNUZ) return initFromFloat8E5M2FNUZAPInt(api); + if (Sem == &semFloat8E4M3) + return initFromFloat8E4M3APInt(api); if (Sem == &semFloat8E4M3FN) return initFromFloat8E4M3FNAPInt(api); if (Sem == &semFloat8E4M3FNUZ) diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index 86a25f4394e19..d50bdf4a65dcb 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -2133,6 +2133,8 @@ TEST(APFloatTest, getZero) { {&APFloat::Float8E5M2(), true, true, {0x80ULL, 0}, 1}, {&APFloat::Float8E5M2FNUZ(), false, false, {0, 0}, 1}, {&APFloat::Float8E5M2FNUZ(), true, false, {0, 0}, 1}, + {&APFloat::Float8E4M3(), false, true, {0, 0}, 1}, + {&APFloat::Float8E4M3(), true, true, {0x80ULL, 0}, 1}, {&APFloat::Float8E4M3FN(), false, true, {0, 0}, 1}, {&APFloat::Float8E4M3FN(), true, true, {0x80ULL, 0}, 1}, {&APFloat::Float8E4M3FNUZ(), false, false, {0, 0}, 1}, @@ -6532,6 +6534,34 @@ TEST(APFloatTest, Float8E5M2ToDouble) { EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); } +TEST(APFloatTest, Float8E4M3ToDouble) { + APFloat One(APFloat::Float8E4M3(), "1.0"); + EXPECT_EQ(1.0, One.convertToDouble()); + APFloat Two(APFloat::Float8E4M3(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E4M3(), false); + EXPECT_EQ(240.0F, PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E4M3(), true); + EXPECT_EQ(-240.0F, NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E4M3(), false); + EXPECT_EQ(0x1.p-6, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E4M3(), true); + EXPECT_EQ(-0x1.p-6, NegSmallest.convertToDouble()); + + APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float8E4M3(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x1.p-9, SmallestDenorm.convertToDouble()); + + APFloat PosInf = APFloat::getInf(APFloat::Float8E4M3()); + EXPECT_EQ(std::numeric_limits::infinity(), PosInf.convertToDouble()); + APFloat NegInf = APFloat::getInf(APFloat::Float8E4M3(), true); + EXPECT_EQ(-std::numeric_limits::infinity(), NegInf.convertToDouble()); + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E4M3()); + EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); +} + TEST(APFloatTest, Float8E4M3FNToDouble) { APFloat One(APFloat::Float8E4M3FN(), "1.0"); EXPECT_EQ(1.0, One.convertToDouble()); @@ -6846,6 +6876,42 @@ TEST(APFloatTest, Float8E5M2ToFloat) { EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); } +TEST(APFloatTest, Float8E4M3ToFloat) { + APFloat PosZero = APFloat::getZero(APFloat::Float8E4M3()); + 
APFloat PosZeroToFloat(PosZero.convertToFloat()); + EXPECT_TRUE(PosZeroToFloat.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float8E4M3(), true); + APFloat NegZeroToFloat(NegZero.convertToFloat()); + EXPECT_TRUE(NegZeroToFloat.isNegZero()); + + APFloat One(APFloat::Float8E4M3(), "1.0"); + EXPECT_EQ(1.0F, One.convertToFloat()); + APFloat Two(APFloat::Float8E4M3(), "2.0"); + EXPECT_EQ(2.0F, Two.convertToFloat()); + + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E4M3(), false); + EXPECT_EQ(240.0F, PosLargest.convertToFloat()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E4M3(), true); + EXPECT_EQ(-240.0F, NegLargest.convertToFloat()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E4M3(), false); + EXPECT_EQ(0x1.p-6, PosSmallest.convertToFloat()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E4M3(), true); + EXPECT_EQ(-0x1.p-6, NegSmallest.convertToFloat()); + + APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float8E4M3(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x1.p-9, SmallestDenorm.convertToFloat()); + + APFloat PosInf = APFloat::getInf(APFloat::Float8E4M3()); + EXPECT_EQ(std::numeric_limits::infinity(), PosInf.convertToFloat()); + APFloat NegInf = APFloat::getInf(APFloat::Float8E4M3(), true); + EXPECT_EQ(-std::numeric_limits::infinity(), NegInf.convertToFloat()); + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E4M3()); + EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); +} + TEST(APFloatTest, Float8E4M3FNToFloat) { APFloat PosZero = APFloat::getZero(APFloat::Float8E4M3FN()); APFloat PosZeroToFloat(PosZero.convertToFloat()); From 7b08c2774ca7350b372f70f63135eacc04d739c5 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 18 Jul 2024 06:34:15 +0000 Subject: [PATCH 375/777] [mlir][Linalg] Remove unused header include. There seems to be no direct usage of any tosa utils. --- mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp index 754f832e98eea..c6c770e2781ff 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp @@ -17,7 +17,6 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/Support/MathExtras.h" From 0ce3ea1bfffcbd62195cf07e34477cc7cc5c5009 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Jul 2024 07:45:13 +0100 Subject: [PATCH 376/777] [AMDGPU] Simplify selection of llvm.amdgcn.inverse.ballot. NFCI. 
(#99345) --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 12 ----------- .../AMDGPU/AMDGPUInstructionSelector.cpp | 13 ------------ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 21 ++++--------------- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +++- llvm/lib/Target/AMDGPU/SIInstructions.td | 10 +++++++-- 6 files changed, 15 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6d5ffc66d98b2..b7471bab12850 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2775,18 +2775,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { case Intrinsic::amdgcn_interp_p1_f16: SelectInterpP1F16(N); return; - case Intrinsic::amdgcn_inverse_ballot: - switch (N->getOperand(1).getValueSizeInBits()) { - case 32: - Opcode = AMDGPU::S_INVERSE_BALLOT_U32; - break; - case 64: - Opcode = AMDGPU::S_INVERSE_BALLOT_U64; - break; - default: - llvm_unreachable("Unsupported size for inverse ballot mask."); - } - break; default: SelectCode(N); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index dcb0f47973c4a..da3e8c0a62b08 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1055,8 +1055,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return selectIntrinsicCmp(I); case Intrinsic::amdgcn_ballot: return selectBallot(I); - case Intrinsic::amdgcn_inverse_ballot: - return selectInverseBallot(I); case Intrinsic::amdgcn_reloc_constant: return selectRelocConstant(I); case Intrinsic::amdgcn_groupstaticsize: @@ -1449,17 +1447,6 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - const DebugLoc &DL = I.getDebugLoc(); - const Register DstReg = I.getOperand(0).getReg(); - const Register MaskReg = I.getOperand(2).getReg(); - - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg); - I.eraseFromParent(); - return true; -} - bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 2d3317e04ce12..43ed210508d33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -112,7 +112,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectDivScale(MachineInstr &MI) const; bool selectIntrinsicCmp(MachineInstr &MI) const; bool selectBallot(MachineInstr &I) const; - bool selectInverseBallot(MachineInstr &I) const; bool selectRelocConstant(MachineInstr &I) const; bool selectGroupStaticSize(MachineInstr &I) const; bool selectReturnAddress(MachineInstr &I) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b68962e0541ce..d5ffb4478bee1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5480,24 +5480,11 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } case AMDGPU::S_INVERSE_BALLOT_U32: - case 
AMDGPU::S_INVERSE_BALLOT_U64: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const DebugLoc &DL = MI.getDebugLoc(); - const Register DstReg = MI.getOperand(0).getReg(); - Register MaskReg = MI.getOperand(1).getReg(); - - const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg); - - if (IsVALU) { - MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI); - } - - BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg); - MI.eraseFromParent(); + case AMDGPU::S_INVERSE_BALLOT_U64: + // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if + // necessary. After that they are equivalent to a COPY. + MI.setDesc(TII->get(AMDGPU::COPY)); return BB; - } case AMDGPU::ENDPGM_TRAP: { const DebugLoc &DL = MI.getDebugLoc(); if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7f7b7c4472042..52044791e6c66 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6686,7 +6686,9 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || MI.getOpcode() == AMDGPU::S_WQM_B32 || - MI.getOpcode() == AMDGPU::S_WQM_B64) { + MI.getOpcode() == AMDGPU::S_WQM_B64 || + MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 || + MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) { MachineOperand &Src = MI.getOperand(1); if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 77b17a0f2789b..f2721fbd164bf 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -212,9 +212,15 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { } let usesCustomInserter = 1 in { -def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>; +def S_INVERSE_BALLOT_U32 : SPseudoInstSI< + (outs SReg_32:$sdst), (ins SSrc_b32:$mask), + [(set i1:$sdst, (int_amdgcn_inverse_ballot i32:$mask))] +>; -def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>; +def S_INVERSE_BALLOT_U64 : SPseudoInstSI< + (outs SReg_64:$sdst), (ins SSrc_b64:$mask), + [(set i1:$sdst, (int_amdgcn_inverse_ballot i64:$mask))] +>; } // End usesCustomInserter = 1 // Pseudo instructions used for @llvm.fptrunc.round upward From 14c323cfd66454c65324c4d5b9d9b6a9c5651eca Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Thu, 18 Jul 2024 09:00:09 +0200 Subject: [PATCH 377/777] [OpenMP][AMDGPU] Do not attach -fcuda-is-device (#99002) -fcuda-is-device flag is not used for OpenMP offloading for AMD GPUs and it does not need to be added as clang cc1 option for OpenMP code. This PR has the same functionality as https://github.com/llvm/llvm-project/pull/96909 but it doesn't introduce regression for virtual function support. 
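To make the "virtual function support" point concrete, below is a minimal sketch of the kind of offload code that must keep working. The type and function names are invented for illustration and are not part of this patch; any supported gfx target (for example gfx906, as used in the driver test) applies when compiling with -fopenmp and an --offload-arch.

```cpp
// Minimal sketch (illustrative names): virtual dispatch on an object that is
// constructed inside the target region, so its vtable must be usable in the
// amdgcn device image even though device-side RTTI emission stays suppressed.
struct Base {
  virtual int value() { return 1; }
};
struct Derived : Base {
  int value() override { return 2; }
};

int dispatch_on_device() {
  int result = 0;
#pragma omp target map(tofrom : result)
  {
    Derived d;       // device-side construction
    Base *p = &d;    // virtual call resolved on the device
    result = p->value();
  }
  return result;     // expected to be 2
}
```

The driver test updated below drops "-fcuda-is-device" from the expected amdgcn-amd-amdhsa cc1 line accordingly.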
--- clang/lib/CodeGen/CodeGenModule.h | 2 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 -- clang/test/Driver/amdgpu-openmp-toolchain.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index caa3786c033b5..657e681730c3a 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1010,7 +1010,7 @@ class CodeGenModule : public CodeGenTypeCache { bool shouldEmitRTTI(bool ForEH = false) { return (ForEH || getLangOpts().RTTI) && !getLangOpts().CUDAIsDevice && !(getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice && - getTriple().isNVPTX()); + (getTriple().isNVPTX() || getTriple().isAMDGPU())); } /// Get the address of the RTTI descriptor for the given type. diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 1c0fb4babe3a5..b75d400e6ce91 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -47,8 +47,6 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions( assert(DeviceOffloadingKind == Action::OFK_OpenMP && "Only OpenMP offloading kinds are supported."); - CC1Args.push_back("-fcuda-is-device"); - if (DriverArgs.hasArg(options::OPT_nogpulib)) return; diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c index 49af04acc4639..a153c4afb0ce8 100644 --- a/clang/test/Driver/amdgpu-openmp-toolchain.c +++ b/clang/test/Driver/amdgpu-openmp-toolchain.c @@ -7,7 +7,7 @@ // verify the tools invocations // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c" -// CHECK: "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}}"-fcuda-is-device"{{.*}}"-target-cpu" "gfx906" +// CHECK: "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}}"-target-cpu" "gfx906" // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-obj" // CHECK: clang-linker-wrapper{{.*}} "-o" "a.out" From 7aabdb8776eb11b90d43162254db47df46806ec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 17 Jul 2024 13:57:01 +0200 Subject: [PATCH 378/777] [clang][Interp][NFC] Protect ByteCodeEmitter against unfinished fns This is similar to a check in TextNodeDumper.cpp. Without this, we will crash later when trying to iterate over FuncDecl->params(). --- clang/lib/AST/Interp/ByteCodeEmitter.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp index 17da77bc63c9b..a3d4c7d7392da 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp +++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp @@ -31,6 +31,12 @@ static bool isUnevaluatedBuiltin(unsigned BuiltinID) { } Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { + + // Manually created functions that haven't been assigned proper + // parameters yet. + if (!FuncDecl->param_empty() && !FuncDecl->param_begin()) + return nullptr; + bool IsLambdaStaticInvoker = false; if (const auto *MD = dyn_cast(FuncDecl); MD && MD->isLambdaStaticInvoker()) { From 4b9bcabdf05346fd72db0d1ad88faa9b969a8f13 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 18 Jul 2024 08:16:40 +0100 Subject: [PATCH 379/777] [AArch64] Add streaming-mode stack hazards. (#98956) Under some SME contexts, a coprocessor with its own separate cache will be used for FPR operations. 
This can create hazards if the CPU and the SME unit try to access the same area of memory, including if the access is to an area of the stack. To try to alleviate that, this patch attempts to introduce extra padding into the stack frame between FP and GPR accesses, controlled by the StackHazardSize option. Without changing the layout of the stack frame, a stack object of the right size is added between GPR and FPR CSRs. Another is added to the stack objects section, and stack objects are sorted so that FPR > Hazard padding slot > GPRs (where possible). Unfortunately some things are not handled well (VLA area, FPR arguments on the stack, object with both GPR and FPR accesses), but if those are controlled by the user then the entire stack frame becomes GPR at the start/end with FPR in the middle, surrounded by Hazard padding. This can greatly help reduce something that can be difficult for the user to control themselves. The current implementation is opt-in through an -aarch64-stack-hazard-size flag, and should have no effect if the option is unset. In the long run the implementation might change (for example using more base pointers to separate in more cases, re-enabling ldp/stp using an extra register, etc), but this gets at least something for people to use in llvm-19 if they need it. The only change whilst the option is unset will be a fix for making sure the stack increment is added at the right place when it cannot be converted to postinc (++MBBI). I believe without extra padding that can not normally be reached. --- .../Target/AArch64/AArch64FrameLowering.cpp | 210 +- .../lib/Target/AArch64/AArch64FrameLowering.h | 4 + .../AArch64/AArch64MachineFunctionInfo.h | 27 + llvm/test/CodeGen/AArch64/stack-hazard.ll | 3051 +++++++++++++++++ 4 files changed, 3281 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/stack-hazard.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f1e860fac732..0589b14949bf4 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -52,6 +52,8 @@ // | async context if needed | // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) +// | | +// |-----------------------------------| // | | // | callee-saved fp/simd/SVE regs | // | | @@ -64,9 +66,11 @@ // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at // |.the.standard.16-byte.alignment....| compile time; if present) // |-----------------------------------| -// | | // | local variables of fixed size | // | including spill slots | +// | | +// | | +// | | // |-----------------------------------| <- bp(not defined by ABI, // |.variable-sized.local.variables....| LLVM chooses X19) // |.(VLAs)............................| (size of this area is unknown at @@ -117,6 +121,20 @@ // // FIXME: also explain the redzone concept. // +// About stack hazards: Under some SME contexts, a coprocessor with its own +// separate cache can used for FP operations. This can create hazards if the CPU +// and the SME unit try to access the same area of memory, including if the +// access is to an area of the stack. To try to alleviate this we attempt to +// introduce extra padding into the stack frame between FP and GPR accesses, +// controlled by the StackHazardSize option. Without changing the layout of the +// stack frame in the diagram above, a stack object of size StackHazardSize is +// added between GPR and FPR CSRs. 
Another is added to the stack objects +// section, and stack objects are sorted so that FPR > Hazard padding slot > +// GPRs (where possible). Unfortunately some things are not handled well (VLA +// area, arguments on the stack, object with both GPR and FPR accesses), but if +// those are controlled by the user then the entire stack frame becomes GPR at +// the start/end with FPR in the middle, surrounded by Hazard padding. +// // An example of the prologue: // // .globl __foo @@ -196,6 +214,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -253,6 +272,14 @@ cl::opt EnableHomogeneousPrologEpilog( cl::desc("Emit homogeneous prologue and epilogue for the size " "optimization (default = off)")); +// Stack hazard padding size. 0 = disabled. +static cl::opt StackHazardSize("aarch64-stack-hazard-size", + cl::init(0), cl::Hidden); +// Whether to insert padding into non-streaming functions (for testing). +static cl::opt + StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", + cl::init(false), cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// Returns how much of the incoming argument stack area (in bytes) we should @@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // update in so create a normal arithmetic instruction instead. if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) { + // If we are destroying the frame, make sure we add the increment after the + // last frame operation. + if (FrameFlag == MachineInstr::FrameDestroy) + ++MBBI; emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag, false, false, nullptr, EmitCFI, @@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs( } int ScalableByteOffset = AFI->getSVECalleeSavedStackSize(); bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); + Register LastReg = 0; // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -2922,8 +2954,15 @@ static void computeCalleeSaveRegisterPairs( else llvm_unreachable("Unsupported register class."); + // Add the stack hazard size as we transition from GPR->FPR CSRs. + if (AFI->hasStackHazardSlotIndex() && + (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && + AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) + ByteOffset += StackFillDir * StackHazardSize; + LastReg = RPI.Reg1; + // Add the next reg to the pair if it is in the same register class. 
- if (unsigned(i + RegInc) < Count) { + if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { @@ -3034,7 +3073,8 @@ static void computeCalleeSaveRegisterPairs( Offset += 8; RPI.Offset = Offset / Scale; - assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || + assert((!RPI.isPaired() || + (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && "Offset out of bounds for LDP/STP immediate"); @@ -3455,6 +3495,80 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } +// Return the FrameID for a Load/Store instruction by looking at the MMO. +static std::optional getLdStFrameID(const MachineInstr &MI, + const MachineFrameInfo &MFI) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + return std::nullopt; + + MachineMemOperand *MMO = *MI.memoperands_begin(); + auto *PSV = + dyn_cast_or_null(MMO->getPseudoValue()); + if (PSV) + return std::optional(PSV->getFrameIndex()); + + if (MMO->getValue()) { + if (auto *Al = dyn_cast(getUnderlyingObject(MMO->getValue()))) { + for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); + FI++) + if (MFI.getObjectAllocation(FI) == Al) + return FI; + } + } + + return std::nullopt; +} + +// Check if a Hazard slot is needed for the current function, and if so create +// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, +// which can be used to determine if any hazard padding is needed. +void AArch64FrameLowering::determineStackHazardSlot( + MachineFunction &MF, BitVector &SavedRegs) const { + if (StackHazardSize == 0 || StackHazardSize % 16 != 0 || + MF.getInfo()->hasStackHazardSlotIndex()) + return; + + // Stack hazards are only needed in streaming functions. + SMEAttrs Attrs(MF.getFunction()); + if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody()) + return; + + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Add a hazard slot if there are any CSR FPR registers, or are any fp-only + // stack objects. + bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { + return AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR128RegClass.contains(Reg) || + AArch64::ZPRRegClass.contains(Reg) || + AArch64::PPRRegClass.contains(Reg); + }); + bool HasFPRStackObjects = false; + if (!HasFPRCSRs) { + std::vector FrameObjects(MFI.getObjectIndexEnd()); + for (auto &MBB : MF) { + for (auto &MI : MBB) { + std::optional FI = getLdStFrameID(MI, MFI); + if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { + if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI)) + FrameObjects[*FI] |= 2; + else + FrameObjects[*FI] |= 1; + } + } + } + HasFPRStackObjects = + any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; }); + } + + if (HasFPRCSRs || HasFPRStackObjects) { + int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false); + LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size " + << StackHazardSize << "\n"); + MF.getInfo()->setStackHazardSlotIndex(ID); + } +} + void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { @@ -3595,6 +3709,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, CSStackSize += 8; } + // Determine if a Hazard slot should be used, and increase the CSStackSize by + // StackHazardSize if so. 
+ determineStackHazardSlot(MF, SavedRegs); + if (AFI->hasStackHazardSlotIndex()) + CSStackSize += StackHazardSize; + // Save number of saved regs, so we can easily update CSStackSize later. unsigned NumSavedRegs = SavedRegs.count(); @@ -3761,10 +3881,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end()); } + Register LastReg = 0; + int HazardSlotIndex = std::numeric_limits::max(); for (auto &CS : CSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // Create a hazard slot as we switch between GPR and FPR CSRs. + if (AFI->hasStackHazardSlotIndex() && + (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && + AArch64InstrInfo::isFpOrNEON(Reg)) { + assert(HazardSlotIndex == std::numeric_limits::max() && + "Unexpected register order for hazard slot"); + HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); + LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex + << "\n"); + AFI->setStackHazardCSRSlotIndex(HazardSlotIndex); + if ((unsigned)HazardSlotIndex < MinCSFrameIndex) + MinCSFrameIndex = HazardSlotIndex; + if ((unsigned)HazardSlotIndex > MaxCSFrameIndex) + MaxCSFrameIndex = HazardSlotIndex; + } + unsigned Size = RegInfo->getSpillSize(*RC); Align Alignment(RegInfo->getSpillAlign(*RC)); int FrameIdx = MFI.CreateStackObject(Size, Alignment, true); @@ -3785,7 +3923,22 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } + LastReg = Reg; + } + + // Add hazard slot in the case where no FPR CSRs are present. + if (AFI->hasStackHazardSlotIndex() && + HazardSlotIndex == std::numeric_limits::max()) { + HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); + LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex + << "\n"); + AFI->setStackHazardCSRSlotIndex(HazardSlotIndex); + if ((unsigned)HazardSlotIndex < MinCSFrameIndex) + MinCSFrameIndex = HazardSlotIndex; + if ((unsigned)HazardSlotIndex > MaxCSFrameIndex) + MaxCSFrameIndex = HazardSlotIndex; } + return true; } @@ -3798,6 +3951,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging( // function doesn't use a FP. if (AFI->hasStreamingModeChanges() && !hasFP(MF)) return false; + // Don't allow register salvaging with hazard slots, in case it moves objects + // into the wrong place. + if (AFI->hasStackHazardSlotIndex()) + return false; return AFI->hasCalleeSaveStackFreeSpace(); } @@ -4492,6 +4649,11 @@ struct FrameObject { // This object's group (which always contains the object with // ObjectFirst==true) should be placed first. bool GroupFirst = false; + + // Used to distinguish between FP and GPR accesses. The values are decided so + // that they sort FPR < Hazard < GPR and they can be or'd together. + unsigned Accesses = 0; + enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 }; }; class GroupBuilder { @@ -4527,8 +4689,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { // at the end. This also allows us to stop walking when we hit the // first invalid item after it's all sorted. // - // The "first" object goes first (closest to SP), followed by the members of - // the "first" group. + // If we want to include a stack hazard region, order FPR accesses < the + // hazard object < GPRs accesses in order to create a separation between the + // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR. 
+ // + // Otherwise the "first" object goes first (closest to SP), followed by the + // members of the "first" group. // // The rest are sorted by the group index to keep the groups together. // Higher numbered groups are more likely to be around longer (i.e. untagged @@ -4537,10 +4703,10 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { // // If all else equal, sort by the object index to keep the objects in the // original order. - return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex, - A.ObjectIndex) < - std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex, - B.ObjectIndex); + return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst, + A.GroupIndex, A.ObjectIndex) < + std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst, + B.GroupIndex, B.ObjectIndex); } } // namespace @@ -4549,6 +4715,7 @@ void AArch64FrameLowering::orderFrameObjects( if (!OrderFrameObjects || ObjectsToAllocate.empty()) return; + const AArch64FunctionInfo &AFI = *MF.getInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); std::vector FrameObjects(MFI.getObjectIndexEnd()); for (auto &Obj : ObjectsToAllocate) { @@ -4556,12 +4723,24 @@ void AArch64FrameLowering::orderFrameObjects( FrameObjects[Obj].ObjectIndex = Obj; } - // Identify stack slots that are tagged at the same time. + // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at + // the same time. GroupBuilder GB(FrameObjects); for (auto &MBB : MF) { for (auto &MI : MBB) { if (MI.isDebugInstr()) continue; + + if (AFI.hasStackHazardSlotIndex()) { + std::optional FI = getLdStFrameID(MI, MFI); + if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { + if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI)) + FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; + else + FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; + } + } + int OpIndex; switch (MI.getOpcode()) { case AArch64::STGloop: @@ -4600,11 +4779,20 @@ void AArch64FrameLowering::orderFrameObjects( GB.EndCurrentGroup(); } + if (AFI.hasStackHazardSlotIndex()) { + FrameObjects[AFI.getStackHazardSlotIndex()].Accesses = + FrameObject::AccessHazard; + // If a stack object is unknown or both GPR and FPR, sort it into GPR. + for (auto &Obj : FrameObjects) + if (!Obj.Accesses || + Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR)) + Obj.Accesses = FrameObject::AccessGPR; + } + // If the function's tagged base pointer is pinned to a stack slot, we want to // put that slot first when possible. This will likely place it at SP + 0, // and save one instruction when generating the base pointer because IRG does // not allow an immediate offset. - const AArch64FunctionInfo &AFI = *MF.getInfo(); std::optional TBPI = AFI.getTaggedBasePointerIndex(); if (TBPI) { FrameObjects[*TBPI].ObjectFirst = true; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 941af03a78b73..da315850d6362 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -155,6 +155,10 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset, bool FollowupAllocs) const; + /// Make a determination whether a Hazard slot is used and create it if + /// needed. 
+ void determineStackHazardSlot(MachineFunction &MF, + BitVector &SavedRegs) const; /// Emit target zero call-used regs. void emitZeroCallUsedRegs(BitVector RegsToZero, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 001521d1101eb..72f110cebbdc8 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -109,6 +109,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize = 0; + /// The stack slots used to add space between FPR and GPR accesses when using + /// hazard padding. StackHazardCSRSlotIndex is added between GPR and FPR CSRs. + /// StackHazardSlotIndex is added between (sorted) stack objects. + int StackHazardSlotIndex = std::numeric_limits::max(); + int StackHazardCSRSlotIndex = std::numeric_limits::max(); + /// True if this function has a subset of CSRs that is handled explicitly via /// copies. bool IsSplitCSR = false; @@ -346,6 +352,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { MaxOffset = std::max(Offset + ObjSize, MaxOffset); } + if (StackHazardCSRSlotIndex != std::numeric_limits::max()) { + int64_t Offset = MFI.getObjectOffset(StackHazardCSRSlotIndex); + int64_t ObjSize = MFI.getObjectSize(StackHazardCSRSlotIndex); + MinOffset = std::min(Offset, MinOffset); + MaxOffset = std::max(Offset + ObjSize, MaxOffset); + } + unsigned Size = alignTo(MaxOffset - MinOffset, 16); assert((!HasCalleeSavedStackSize || getCalleeSavedStackSize() == Size) && "Invalid size calculated for callee saves"); @@ -403,6 +416,20 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } + bool hasStackHazardSlotIndex() const { + return StackHazardSlotIndex != std::numeric_limits::max(); + } + int getStackHazardSlotIndex() const { return StackHazardSlotIndex; } + void setStackHazardSlotIndex(int Index) { + assert(StackHazardSlotIndex == std::numeric_limits::max()); + StackHazardSlotIndex = Index; + } + int getStackHazardCSRSlotIndex() const { return StackHazardCSRSlotIndex; } + void setStackHazardCSRSlotIndex(int Index) { + assert(StackHazardCSRSlotIndex == std::numeric_limits::max()); + StackHazardCSRSlotIndex = Index; + } + unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll new file mode 100644 index 0000000000000..50a2e41f45756 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -0,0 +1,3051 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024 + +define i32 @basic(i32 noundef %num) { +; CHECK-LABEL: basic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + ret i32 0 +} + +; Non-streaming functions don't need hazards +define i32 @csr_d8_notsc(i32 noundef %num) { +; CHECK-LABEL: csr_d8_notsc: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset b8, -16 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret i32 0 +} + +; Very simple - doesn't require hazards +define i32 @basic_sc(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: basic_sc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + ret i32 0 +} + +; No fpr accesses/csrs - doesn't require hazards +define i32 @nocsr_alloci64(i64 %d) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: nocsr_alloci64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str x8, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %a = alloca i64 + store i64 %d, ptr %a + ret i32 0 +} + +; No fpr accesses/csrs - doesn't require hazards +define i32 @csr_x20(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: csr_x20: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x20, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x20, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret i32 0 +} + +; CSR of d8. Make sure there is a gap between FPR and GPR +define i32 @csr_d8(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2064 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret i32 0 +} + +; Stack fpr objects. 
+define i32 @nocsr_allocd(double %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: nocsr_allocd: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #16 +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: str d0, [sp, #8] +; CHECK0-NEXT: add sp, sp, #16 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: nocsr_allocd: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: .cfi_def_cfa_offset 80 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: str d0, [sp, #72] +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: nocsr_allocd: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: str d0, [sp, #1032] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + store double %d, ptr %a + ret i32 0 +} + +define i32 @csr_d8d9(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8d9: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -8 +; CHECK0-NEXT: .cfi_offset b9, -16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8d9: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset b8, -72 +; CHECK64-NEXT: .cfi_offset b9, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8d9: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: stp d9, d8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_offset b8, -1048 +; CHECK1024-NEXT: .cfi_offset b9, -1056 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldp d9, d8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8},~{d9}"() #1 + ret i32 0 +} + +define i32 @csr_d8_allocd(double %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_allocd: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp d8, d0, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_allocd: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #160 +; CHECK64-NEXT: stp d0, d8, [sp, #72] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 160 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #160 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_allocd: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str d0, [sp, #1032] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store double %d, ptr %a + ret i32 0 +} + +define i32 @csr_d8_alloci64(i64 %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_alloci64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: str x8, [sp, #8] +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_alloci64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #160 +; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 160 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: add sp, sp, #160 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_alloci64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +; Check the frame pointer is in the right place +define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK0-LABEL: 
csr_d8_allocd_framepointer: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill +; CHECK0-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: add x29, sp, #16 +; CHECK0-NEXT: .cfi_def_cfa w29, 16 +; CHECK0-NEXT: .cfi_offset w30, -8 +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_offset b8, -32 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: str d0, [sp, #8] +; CHECK0-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_allocd_framepointer: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #176 +; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: stp x29, x30, [sp, #152] // 16-byte Folded Spill +; CHECK64-NEXT: add x29, sp, #80 +; CHECK64-NEXT: .cfi_def_cfa w29, 96 +; CHECK64-NEXT: .cfi_offset w30, -16 +; CHECK64-NEXT: .cfi_offset w29, -24 +; CHECK64-NEXT: .cfi_offset b8, -96 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: stur d0, [x29, #-8] +; CHECK64-NEXT: ldr x29, [sp, #152] // 8-byte Folded Reload +; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: add sp, sp, #176 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_allocd_framepointer: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: mov x29, sp +; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa w29, 1056 +; CHECK1024-NEXT: .cfi_offset w30, -16 +; CHECK1024-NEXT: .cfi_offset w29, -24 +; CHECK1024-NEXT: .cfi_offset b8, -1056 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: stur d0, [x29, #-8] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store double %d, ptr %a + ret i32 0 +} + +; sve stack objects should live with other fpr registers +define i32 @csr_d8_allocnxv4i32(i64 %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_allocnxv4i32: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: str x29, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -8 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov z0.s, #0 // =0x0 +; CHECK0-NEXT: ptrue p0.s +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: st1w { z0.s }, p0, [sp] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_allocnxv4i32: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: str d8, [sp, #-80]! 
// 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #72] // 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -8 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: mov z0.s, #0 // =0x0 +; CHECK64-NEXT: ptrue p0.s +; CHECK64-NEXT: add x8, sp, #64 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: st1w { z0.s }, p0, [x8] +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: ldr x29, [sp, #72] // 8-byte Folded Reload +; CHECK64-NEXT: ldr d8, [sp], #80 // 8-byte Folded Reload +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_allocnxv4i32: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-NEXT: ptrue p0.s +; CHECK1024-NEXT: add x8, sp, #1024 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: st1w { z0.s }, p0, [x8] +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + ret i32 0 +} + +define i32 @csr_x18_25_d8_15_allocdi64(i64 %d, double %e) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_x18_25_d8_15_allocdi64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #144 +; CHECK0-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: str x25, [sp, #80] // 8-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 144 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w21, -24 +; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: .cfi_offset w23, -40 +; CHECK0-NEXT: .cfi_offset w24, -48 +; CHECK0-NEXT: .cfi_offset w25, -64 +; CHECK0-NEXT: .cfi_offset b8, -72 +; CHECK0-NEXT: .cfi_offset b9, -80 +; CHECK0-NEXT: .cfi_offset b10, -88 +; CHECK0-NEXT: .cfi_offset b11, -96 +; CHECK0-NEXT: .cfi_offset b12, -104 +; CHECK0-NEXT: .cfi_offset b13, -112 +; CHECK0-NEXT: .cfi_offset b14, -120 +; CHECK0-NEXT: .cfi_offset b15, -128 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x25, [sp, #80] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload +; CHECK0-NEXT: 
mov w0, wzr +; CHECK0-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; CHECK0-NEXT: str x8, [sp, #88] +; CHECK0-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: str d0, [sp, #8] +; CHECK0-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: add sp, sp, #144 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_x18_25_d8_15_allocdi64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #288 +; CHECK64-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #224] // 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #240] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 288 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w21, -24 +; CHECK64-NEXT: .cfi_offset w22, -32 +; CHECK64-NEXT: .cfi_offset w23, -40 +; CHECK64-NEXT: .cfi_offset w24, -48 +; CHECK64-NEXT: .cfi_offset w25, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_offset b8, -136 +; CHECK64-NEXT: .cfi_offset b9, -144 +; CHECK64-NEXT: .cfi_offset b10, -152 +; CHECK64-NEXT: .cfi_offset b11, -160 +; CHECK64-NEXT: .cfi_offset b12, -168 +; CHECK64-NEXT: .cfi_offset b13, -176 +; CHECK64-NEXT: .cfi_offset b14, -184 +; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: ldp x20, x19, [sp, #272] // 16-byte Folded Reload +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldp x22, x21, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: ldp x24, x23, [sp, #240] // 16-byte Folded Reload +; CHECK64-NEXT: str d0, [sp, #88] +; CHECK64-NEXT: ldp x29, x25, [sp, #224] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #288 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1152 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1144] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2208 +; CHECK1024-NEXT: 
.cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w21, -24 +; CHECK1024-NEXT: .cfi_offset w22, -32 +; CHECK1024-NEXT: .cfi_offset w23, -40 +; CHECK1024-NEXT: .cfi_offset w24, -48 +; CHECK1024-NEXT: .cfi_offset w25, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_offset b8, -1096 +; CHECK1024-NEXT: .cfi_offset b9, -1104 +; CHECK1024-NEXT: .cfi_offset b10, -1112 +; CHECK1024-NEXT: .cfi_offset b11, -1120 +; CHECK1024-NEXT: .cfi_offset b12, -1128 +; CHECK1024-NEXT: .cfi_offset b13, -1136 +; CHECK1024-NEXT: .cfi_offset b14, -1144 +; CHECK1024-NEXT: .cfi_offset b15, -1152 +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: str d0, [sp, #1048] +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1144] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x23, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x24, [sp, #1104] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x25, [sp, #1096] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1152 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + %b = alloca double + tail call void asm sideeffect "", "~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25}"() + tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"() + store i64 %d, ptr %a + store double %e, ptr %b + ret i32 0 +} + +define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarch64_pstate_sm_body" "target-features"="+sme" { +; CHECK0-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #176 +; CHECK0-NEXT: .cfi_def_cfa_offset 176 +; CHECK0-NEXT: rdsvl x9, #1 +; CHECK0-NEXT: stp d15, d14, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: lsr x9, x9, #3 +; CHECK0-NEXT: stp d13, d12, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #80] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #96] // 16-byte Folded Spill +; CHECK0-NEXT: str x25, [sp, #112] // 8-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #128] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w21, -24 +; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: .cfi_offset w23, -40 +; CHECK0-NEXT: .cfi_offset w24, -48 +; CHECK0-NEXT: .cfi_offset w25, -64 +; CHECK0-NEXT: .cfi_offset b8, -72 +; CHECK0-NEXT: .cfi_offset b9, -80 +; CHECK0-NEXT: .cfi_offset b10, -88 +; CHECK0-NEXT: .cfi_offset b11, -96 +; CHECK0-NEXT: .cfi_offset b12, -104 +; CHECK0-NEXT: .cfi_offset b13, -112 +; CHECK0-NEXT: 
.cfi_offset b14, -120 +; CHECK0-NEXT: .cfi_offset b15, -128 +; CHECK0-NEXT: .cfi_offset vg, -136 +; CHECK0-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK0-NEXT: str x0, [sp, #24] +; CHECK0-NEXT: str d0, [sp, #16] +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x25, [sp, #112] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d9, d8, [sp, #96] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d11, d10, [sp, #80] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: add sp, sp, #176 +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w21 +; CHECK0-NEXT: .cfi_restore w22 +; CHECK0-NEXT: .cfi_restore w23 +; CHECK0-NEXT: .cfi_restore w24 +; CHECK0-NEXT: .cfi_restore w25 +; CHECK0-NEXT: .cfi_restore b8 +; CHECK0-NEXT: .cfi_restore b9 +; CHECK0-NEXT: .cfi_restore b10 +; CHECK0-NEXT: .cfi_restore b11 +; CHECK0-NEXT: .cfi_restore b12 +; CHECK0-NEXT: .cfi_restore b13 +; CHECK0-NEXT: .cfi_restore b14 +; CHECK0-NEXT: .cfi_restore b15 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #304 +; CHECK64-NEXT: .cfi_def_cfa_offset 304 +; CHECK64-NEXT: rdsvl x9, #1 +; CHECK64-NEXT: stp d15, d14, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: lsr x9, x9, #3 +; CHECK64-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #96] // 8-byte Folded Spill +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: str x9, [sp, #104] // 8-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #240] // 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w21, -24 +; CHECK64-NEXT: .cfi_offset w22, -32 +; CHECK64-NEXT: .cfi_offset w23, -40 +; CHECK64-NEXT: .cfi_offset w24, -48 +; CHECK64-NEXT: .cfi_offset w25, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_offset b8, -136 +; CHECK64-NEXT: .cfi_offset b9, -144 +; CHECK64-NEXT: .cfi_offset b10, -152 +; CHECK64-NEXT: .cfi_offset b11, -160 +; CHECK64-NEXT: .cfi_offset b12, -168 +; CHECK64-NEXT: .cfi_offset b13, -176 +; CHECK64-NEXT: .cfi_offset b14, -184 +; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: .cfi_offset vg, -200 +; CHECK64-NEXT: str d0, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: ldr d0, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: str x0, [sp, #8] +; CHECK64-NEXT: str d0, [sp, #88] +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; CHECK64-NEXT: mov w0, wzr +; 
CHECK64-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp, #240] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #304 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w21 +; CHECK64-NEXT: .cfi_restore w22 +; CHECK64-NEXT: .cfi_restore w23 +; CHECK64-NEXT: .cfi_restore w24 +; CHECK64-NEXT: .cfi_restore w25 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: .cfi_restore b8 +; CHECK64-NEXT: .cfi_restore b9 +; CHECK64-NEXT: .cfi_restore b10 +; CHECK64-NEXT: .cfi_restore b11 +; CHECK64-NEXT: .cfi_restore b12 +; CHECK64-NEXT: .cfi_restore b13 +; CHECK64-NEXT: .cfi_restore b14 +; CHECK64-NEXT: .cfi_restore b15 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: rdsvl x9, #1 +; CHECK1024-NEXT: lsr x9, x9, #3 +; CHECK1024-NEXT: sub sp, sp, #1168 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1168 +; CHECK1024-NEXT: str x9, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; CHECK1024-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1144] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1152] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1160] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w21, -24 +; CHECK1024-NEXT: .cfi_offset w22, -32 +; CHECK1024-NEXT: .cfi_offset w23, -40 +; CHECK1024-NEXT: .cfi_offset w24, -48 +; CHECK1024-NEXT: .cfi_offset w25, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_offset b8, -1096 +; CHECK1024-NEXT: .cfi_offset b9, -1104 +; CHECK1024-NEXT: .cfi_offset b10, -1112 +; CHECK1024-NEXT: .cfi_offset b11, -1120 +; CHECK1024-NEXT: .cfi_offset b12, -1128 +; CHECK1024-NEXT: .cfi_offset b13, -1136 +; CHECK1024-NEXT: .cfi_offset b14, -1144 +; CHECK1024-NEXT: .cfi_offset b15, -1152 +; CHECK1024-NEXT: .cfi_offset vg, -1160 +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2224 +; CHECK1024-NEXT: str d0, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: ldr d0, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: str x0, [sp, #8] +; CHECK1024-NEXT: str d0, [sp, #1048] +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1168 +; CHECK1024-NEXT: ldp d9, d8, [sp, #64] 
// 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1160] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1152] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1144] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x23, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x24, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x25, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1104] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1168 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w20 +; CHECK1024-NEXT: .cfi_restore w21 +; CHECK1024-NEXT: .cfi_restore w22 +; CHECK1024-NEXT: .cfi_restore w23 +; CHECK1024-NEXT: .cfi_restore w24 +; CHECK1024-NEXT: .cfi_restore w25 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: .cfi_restore b8 +; CHECK1024-NEXT: .cfi_restore b9 +; CHECK1024-NEXT: .cfi_restore b10 +; CHECK1024-NEXT: .cfi_restore b11 +; CHECK1024-NEXT: .cfi_restore b12 +; CHECK1024-NEXT: .cfi_restore b13 +; CHECK1024-NEXT: .cfi_restore b14 +; CHECK1024-NEXT: .cfi_restore b15 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + %b = alloca double + tail call void asm sideeffect "", "~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25}"() + tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"() + store i64 %d, ptr %a + store double %e, ptr %b + ret i32 0 +} + +; We don't currently handle fpr stack arguments very well (they are hopefully relatively rare). +define float @nocsr_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: nocsr_stackargs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [sp] +; CHECK-NEXT: ret +entry: + ret float %i +} + +define float @csr_x20_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: csr_x20_stackargs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x20, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: ldr s0, [sp, #16] +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x20, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +define float @csr_d8_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_stackargs: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: ldr s0, [sp, #16] +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_stackargs: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: ldr s0, [sp, #144] +; CHECK64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_stackargs: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2064 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: ldr s0, [sp, #2064] +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret float %i +} + +; SVE calling conventions +define i32 @svecc_basic(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_basic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + ret i32 0 +} + +define i32 @svecc_csr_x20(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_csr_x20: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x20, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x20, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret i32 0 +} + +define i32 @svecc_csr_d8(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret i32 0 +} + +define i32 @svecc_csr_d8d9(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8d9: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-2 +; CHECK0-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #2 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8d9: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-2 +; CHECK64-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 16 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 80 - 16 * VG +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #2 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8d9: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-2 +; CHECK1024-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1040 - 16 * VG +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #2 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8},~{d9}"() #1 + ret i32 0 +} + +define i32 @svecc_csr_d8_allocd(double %d, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8_allocd: +; 
CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: addvl x8, sp, #1 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: str d0, [x8, #8] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8_allocd: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: str d0, [sp, #72] +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8_allocd: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str d0, [sp, #1032] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store double %d, ptr %a + ret i32 0 +} + +define i32 @svecc_csr_d8_alloci64(i64 %d, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8_alloci64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: addvl x9, sp, #1 +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: str x8, [x9, #8] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8_alloci64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8_alloci64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: mov z0.s, #0 // =0x0 +; CHECK0-NEXT: ptrue p0.s +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: st1w { z0.s }, p0, [sp] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 16 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov z0.s, #0 // =0x0 +; CHECK64-NEXT: ptrue p0.s +; CHECK64-NEXT: add x8, sp, #64 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: st1w { z0.s }, p0, [x8] +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-NEXT: ptrue p0.s +; CHECK1024-NEXT: add x8, sp, #1024 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: st1w { z0.s }, p0, [x8] +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + ret i32 0 +} + +define i32 @svecc_csr_x18_25_d8_15_allocdi64(i64 %d, double %e, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x25, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-8 +; CHECK0-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: sub sp, sp, #16 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 64 * VG +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w21, -24 +; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: .cfi_offset w23, -40 +; CHECK0-NEXT: .cfi_offset w24, -48 +; CHECK0-NEXT: .cfi_offset w25, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: str x8, [sp, #8] +; CHECK0-NEXT: str d0, [sp], #16 +; CHECK0-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #8 +; CHECK0-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: stp x29, x25, [sp, #64] // 16-byte Folded Spill +; 
CHECK64-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-8 +; CHECK64-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #96 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 224 + 64 * VG +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w21, -24 +; CHECK64-NEXT: .cfi_offset w22, -32 +; CHECK64-NEXT: .cfi_offset w23, -40 +; CHECK64-NEXT: .cfi_offset w24, -48 +; CHECK64-NEXT: .cfi_offset w25, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: str d0, [sp, #88] +; CHECK64-NEXT: add sp, sp, #96 +; CHECK64-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #8 +; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: 
svecc_csr_x18_25_d8_15_allocdi64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-8 +; CHECK1024-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w21, -24 +; CHECK1024-NEXT: .cfi_offset w22, -32 +; CHECK1024-NEXT: .cfi_offset w23, -40 +; CHECK1024-NEXT: .cfi_offset w24, -48 +; CHECK1024-NEXT: .cfi_offset w25, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: str d0, [sp, #1048] +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: 
ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT: addvl sp, sp, #8
+; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x21, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x22, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x23, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x24, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x25, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1088
+; CHECK1024-NEXT: ret
+entry:
+  %a = alloca i64
+  %b = alloca double
+  tail call void asm sideeffect "", "~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25}"()
+  tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"()
+  store i64 %d, ptr %a
+  store double %e, ptr %b
+  ret i32 0
+}
+
+
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: sve_signature_pred_2xv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov p1.b, p3.b
+; CHECK-NEXT: mov p0.b, p2.b
+; CHECK-NEXT: ret
+  ret [2 x <vscale x 4 x i1>] %arg2
+}
+
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK0-NEXT: addvl sp, sp, #-1
+; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: mov p5.b, p0.b
+; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: mov p4.b, p1.b
+; CHECK0-NEXT: mov p0.b, p2.b
+; CHECK0-NEXT: mov p1.b, p3.b
+; CHECK0-NEXT: mov p2.b, p5.b
+; CHECK0-NEXT: mov p3.b, p4.b
+; CHECK0-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: addvl sp, sp, #1
+; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK64: // %bb.0:
+; CHECK64-NEXT: sub sp, sp, #80
+; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT: addvl sp, sp, #-1
+; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: mov p4.b, p1.b
+; CHECK64-NEXT: mov p5.b, p0.b
+; CHECK64-NEXT: mov p0.b, p2.b
+; CHECK64-NEXT: mov p1.b, p3.b
+; CHECK64-NEXT: mov p2.b, p5.b
+; CHECK64-NEXT: mov p3.b, p4.b
+; CHECK64-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK64-NEXT: add sp, sp, #64
+; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: addvl sp, sp, #1
+; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ret
+;
+; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK1024: // %bb.0:
+; CHECK1024-NEXT: sub sp, sp, #1040
+; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT: addvl sp, sp, #-1
+; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT: sub sp, sp, #1024
+; CHECK1024-NEXT: mov p4.b, p1.b
+; CHECK1024-NEXT: mov p5.b, p0.b
+; CHECK1024-NEXT: mov p0.b, p2.b
+; CHECK1024-NEXT: mov p1.b, p3.b
+; CHECK1024-NEXT: mov p2.b, p5.b
+; CHECK1024-NEXT: mov p3.b, p4.b
+; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK1024-NEXT: add sp, sp, #1024
+; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT: addvl sp, sp, #1
+; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1040
+; CHECK1024-NEXT: ret
+  %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+  ret [2 x <vscale x 4 x i1>] %res
+}
+
+define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: f128_libcall:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: sub sp, sp, #176
+; CHECK0-NEXT: .cfi_def_cfa_offset 176
+; CHECK0-NEXT: cntd x9
+; CHECK0-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill
+; CHECK0-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill
+; CHECK0-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill
+; CHECK0-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x30, x9, [sp, #128] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w20, -16
+; CHECK0-NEXT: .cfi_offset w21, -24
+; CHECK0-NEXT: .cfi_offset w22, -32
+; CHECK0-NEXT: .cfi_offset w30, -48
+; CHECK0-NEXT: .cfi_offset b8, -56
+; CHECK0-NEXT: .cfi_offset b9, -64
+; CHECK0-NEXT: .cfi_offset b10, -72
+; CHECK0-NEXT: .cfi_offset b11, -80
+; CHECK0-NEXT: .cfi_offset b12, -88
+; CHECK0-NEXT: .cfi_offset b13, -96
+; CHECK0-NEXT: .cfi_offset b14, -104
+; CHECK0-NEXT: .cfi_offset b15, -112
+; CHECK0-NEXT: mov w19, w1
+; CHECK0-NEXT: mov w20, w0
+; CHECK0-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK0-NEXT: stp q2, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: and x21, x0, #0x1
+; CHECK0-NEXT: .cfi_offset vg, -40
+; CHECK0-NEXT: tbz w21, #0, .LBB27_2
+; CHECK0-NEXT: // %bb.1:
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB27_2:
+; CHECK0-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK0-NEXT: bl __lttf2
+; CHECK0-NEXT: tbz w21, #0, .LBB27_4
+; CHECK0-NEXT: // %bb.3:
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB27_4:
+; CHECK0-NEXT: cmp w0, #0
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: cset w21, lt
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: and x22, x0, #0x1
+; CHECK0-NEXT: .cfi_offset vg, -40
+; CHECK0-NEXT: tbz w22, #0, .LBB27_6
+; CHECK0-NEXT: // %bb.5:
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB27_6:
+; CHECK0-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK0-NEXT: bl __getf2
+; CHECK0-NEXT: tbz w22, #0, .LBB27_8
+; CHECK0-NEXT: // %bb.7:
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB27_8:
+; CHECK0-NEXT: cmp w0, #0
+; CHECK0-NEXT: cset w8, ge
+; CHECK0-NEXT: tst w8, w21
+; CHECK0-NEXT: csel w0, w20, w19, ne
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
+; CHECK0-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp d11, d10, [sp, #96] //
16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: add sp, sp, #176 +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w21 +; CHECK0-NEXT: .cfi_restore w22 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore b8 +; CHECK0-NEXT: .cfi_restore b9 +; CHECK0-NEXT: .cfi_restore b10 +; CHECK0-NEXT: .cfi_restore b11 +; CHECK0-NEXT: .cfi_restore b12 +; CHECK0-NEXT: .cfi_restore b13 +; CHECK0-NEXT: .cfi_restore b14 +; CHECK0-NEXT: .cfi_restore b15 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: f128_libcall: +; CHECK64: // %bb.0: +; CHECK64-NEXT: sub sp, sp, #320 +; CHECK64-NEXT: .cfi_def_cfa_offset 320 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp d15, d14, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d13, d12, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #160] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #176] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x30, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x22, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: stp x21, x20, [sp, #288] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #304] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w20, -24 +; CHECK64-NEXT: .cfi_offset w21, -32 +; CHECK64-NEXT: .cfi_offset w22, -40 +; CHECK64-NEXT: .cfi_offset w30, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_offset b8, -136 +; CHECK64-NEXT: .cfi_offset b9, -144 +; CHECK64-NEXT: .cfi_offset b10, -152 +; CHECK64-NEXT: .cfi_offset b11, -160 +; CHECK64-NEXT: .cfi_offset b12, -168 +; CHECK64-NEXT: .cfi_offset b13, -176 +; CHECK64-NEXT: .cfi_offset b14, -184 +; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: mov w19, w1 +; CHECK64-NEXT: mov w20, w0 +; CHECK64-NEXT: stp q0, q1, [sp, #64] // 32-byte Folded Spill +; CHECK64-NEXT: stp q2, q3, [sp, #96] // 32-byte Folded Spill +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x21, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: tbz w21, #0, .LBB27_2 +; CHECK64-NEXT: // %bb.1: +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB27_2: +; CHECK64-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload +; CHECK64-NEXT: bl __lttf2 +; CHECK64-NEXT: tbz w21, #0, .LBB27_4 +; CHECK64-NEXT: // %bb.3: +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB27_4: +; CHECK64-NEXT: cmp w0, #0 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: cset w21, lt +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x22, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: tbz w22, #0, .LBB27_6 +; CHECK64-NEXT: // %bb.5: +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB27_6: +; CHECK64-NEXT: ldp q0, q1, [sp, #96] // 32-byte Folded Reload +; CHECK64-NEXT: bl __getf2 +; CHECK64-NEXT: tbz w22, #0, .LBB27_8 +; CHECK64-NEXT: // %bb.7: +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB27_8: +; CHECK64-NEXT: cmp w0, #0 +; CHECK64-NEXT: cset w8, ge +; CHECK64-NEXT: tst w8, w21 +; CHECK64-NEXT: csel w0, w20, w19, ne +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: ldp x20, x19, [sp, #296] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #280] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #176] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload +; CHECK64-NEXT: ldp 
d13, d12, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #320 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w21 +; CHECK64-NEXT: .cfi_restore w22 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: .cfi_restore b8 +; CHECK64-NEXT: .cfi_restore b9 +; CHECK64-NEXT: .cfi_restore b10 +; CHECK64-NEXT: .cfi_restore b11 +; CHECK64-NEXT: .cfi_restore b12 +; CHECK64-NEXT: .cfi_restore b13 +; CHECK64-NEXT: .cfi_restore b14 +; CHECK64-NEXT: .cfi_restore b15 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: f128_libcall: +; CHECK1024: // %bb.0: +; CHECK1024-NEXT: sub sp, sp, #1152 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1152 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -16 +; CHECK1024-NEXT: .cfi_offset w20, -24 +; CHECK1024-NEXT: .cfi_offset w21, -32 +; CHECK1024-NEXT: .cfi_offset w22, -40 +; CHECK1024-NEXT: .cfi_offset w30, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_offset b8, -1096 +; CHECK1024-NEXT: .cfi_offset b9, -1104 +; CHECK1024-NEXT: .cfi_offset b10, -1112 +; CHECK1024-NEXT: .cfi_offset b11, -1120 +; CHECK1024-NEXT: .cfi_offset b12, -1128 +; CHECK1024-NEXT: .cfi_offset b13, -1136 +; CHECK1024-NEXT: .cfi_offset b14, -1144 +; CHECK1024-NEXT: .cfi_offset b15, -1152 +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2240 +; CHECK1024-NEXT: mov w19, w1 +; CHECK1024-NEXT: mov w20, w0 +; CHECK1024-NEXT: str q3, [sp, #1072] // 16-byte Folded Spill +; CHECK1024-NEXT: str q2, [sp, #1056] // 16-byte Folded Spill +; CHECK1024-NEXT: str q1, [sp, #1040] // 16-byte Folded Spill +; CHECK1024-NEXT: str q0, [sp, #1024] // 16-byte Folded Spill +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x21, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: tbz w21, #0, .LBB27_2 +; CHECK1024-NEXT: // %bb.1: +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB27_2: +; CHECK1024-NEXT: ldr q0, [sp, #1024] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr q1, [sp, #1040] // 16-byte Folded Reload +; CHECK1024-NEXT: bl __lttf2 +; CHECK1024-NEXT: tbz w21, #0, .LBB27_4 +; CHECK1024-NEXT: // %bb.3: +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB27_4: +; CHECK1024-NEXT: cmp w0, #0 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: cset w21, lt +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x22, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: tbz w22, #0, .LBB27_6 +; CHECK1024-NEXT: // %bb.5: +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB27_6: +; CHECK1024-NEXT: ldr q0, [sp, #1056] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr q1, [sp, #1072] // 16-byte Folded Reload +; CHECK1024-NEXT: bl __getf2 +; 
CHECK1024-NEXT: tbz w22, #0, .LBB27_8
+; CHECK1024-NEXT: // %bb.7:
+; CHECK1024-NEXT: smstart sm
+; CHECK1024-NEXT: .LBB27_8:
+; CHECK1024-NEXT: cmp w0, #0
+; CHECK1024-NEXT: cset w8, ge
+; CHECK1024-NEXT: tst w8, w21
+; CHECK1024-NEXT: csel w0, w20, w19, ne
+; CHECK1024-NEXT: .cfi_restore vg
+; CHECK1024-NEXT: add sp, sp, #1088
+; CHECK1024-NEXT: .cfi_def_cfa_offset 1152
+; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-NEXT: ldr x19, [sp, #1136] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-NEXT: ldr x20, [sp, #1128] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x21, [sp, #1120] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x22, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1152
+; CHECK1024-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NEXT: .cfi_restore w19
+; CHECK1024-NEXT: .cfi_restore w20
+; CHECK1024-NEXT: .cfi_restore w21
+; CHECK1024-NEXT: .cfi_restore w22
+; CHECK1024-NEXT: .cfi_restore w30
+; CHECK1024-NEXT: .cfi_restore w29
+; CHECK1024-NEXT: .cfi_restore b8
+; CHECK1024-NEXT: .cfi_restore b9
+; CHECK1024-NEXT: .cfi_restore b10
+; CHECK1024-NEXT: .cfi_restore b11
+; CHECK1024-NEXT: .cfi_restore b12
+; CHECK1024-NEXT: .cfi_restore b13
+; CHECK1024-NEXT: .cfi_restore b14
+; CHECK1024-NEXT: .cfi_restore b15
+; CHECK1024-NEXT: ret
+  %c0 = fcmp olt fp128 %v0, %v1
+  %c1 = fcmp oge fp128 %v2, %v3
+  %cr = and i1 %c1, %c0
+  %sel = select i1 %cr, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-48]!
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w27, -16 +; CHECK0-NEXT: .cfi_offset w28, -24 +; CHECK0-NEXT: .cfi_offset w30, -40 +; CHECK0-NEXT: .cfi_offset w29, -48 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: 
//NO_APP +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: and x19, x0, #0x1 +; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: tbz w19, #0, .LBB28_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB28_2: // %entry +; CHECK0-NEXT: mov x0, x8 +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: mov w2, #37 // =0x25 +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB28_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB28_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #18 +; CHECK0-NEXT: .cfi_def_cfa wsp, 48 +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_call: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 112 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: 
stp x27, x19, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w27, -16 +; CHECK64-NEXT: .cfi_offset w28, -24 +; CHECK64-NEXT: .cfi_offset w30, -40 +; CHECK64-NEXT: .cfi_offset w29, -48 +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 112 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 112 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 112 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 112 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 112 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 112 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 112 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x01, 0x22, 0x11, 0x90, 0x01, 
0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 176 + 144 * VG +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x19, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: tbz w19, #0, .LBB28_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB28_2: // %entry +; CHECK64-NEXT: mov x0, x8 +; CHECK64-NEXT: mov w1, #45 // =0x2d +; CHECK64-NEXT: mov w2, #37 // =0x25 +; CHECK64-NEXT: bl memset +; CHECK64-NEXT: tbz w19, #0, .LBB28_4 +; CHECK64-NEXT: // %bb.3: // %entry +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB28_4: // %entry +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #18 +; CHECK64-NEXT: .cfi_def_cfa wsp, 112 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 +; CHECK64-NEXT: ldp x27, x19, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w27 +; CHECK64-NEXT: .cfi_restore 
w28 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_call: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1072 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w27, -16 +; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: .cfi_offset w30, -40 +; CHECK1024-NEXT: .cfi_offset w29, -48 +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 32 * VG +; 
CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x19, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB28_2: // %entry +; CHECK1024-NEXT: mov x0, x8 +; CHECK1024-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NEXT: bl memset +; CHECK1024-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-NEXT: // %bb.3: // %entry +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB28_4: // %entry +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, 
[sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT: addvl sp, sp, #18
+; CHECK1024-NEXT: .cfi_def_cfa wsp, 1072
+; CHECK1024-NEXT: .cfi_restore z8
+; CHECK1024-NEXT: .cfi_restore z9
+; CHECK1024-NEXT: .cfi_restore z10
+; CHECK1024-NEXT: .cfi_restore z11
+; CHECK1024-NEXT: .cfi_restore z12
+; CHECK1024-NEXT: .cfi_restore z13
+; CHECK1024-NEXT: .cfi_restore z14
+; CHECK1024-NEXT: .cfi_restore z15
+; CHECK1024-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1072
+; CHECK1024-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NEXT: .cfi_restore w19
+; CHECK1024-NEXT: .cfi_restore w27
+; CHECK1024-NEXT: .cfi_restore w28
+; CHECK1024-NEXT: .cfi_restore w30
+; CHECK1024-NEXT: .cfi_restore w29
+; CHECK1024-NEXT: ret
+entry:
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
+  ret i32 -396142473
+}
+
+define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_alloca_call:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_def_cfa_offset 48
+; CHECK0-NEXT: cntd x9
+; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w27, -16
+; CHECK0-NEXT: .cfi_offset w28, -24
+; CHECK0-NEXT: .cfi_offset w30, -40
+; CHECK0-NEXT: .cfi_offset w29, -48
+; CHECK0-NEXT: addvl sp, sp, #-18
+; CHECK0-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG
+; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte
Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; CHECK0-NEXT: sub sp, sp, #48 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 96 + 144 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: and x19, x0, #0x1 +; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: tbz w19, #0, .LBB29_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB29_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: mov w2, #37 // =0x25 +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB29_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB29_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: add sp, sp, #48 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 
2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #18 +; CHECK0-NEXT: .cfi_def_cfa wsp, 48 +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_alloca_call: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 112 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x27, x19, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w27, -16 +; CHECK64-NEXT: .cfi_offset w28, -24 +; CHECK64-NEXT: .cfi_offset w30, -40 +; CHECK64-NEXT: .cfi_offset w29, -48 +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded 
Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 112 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 112 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 112 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 112 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 112 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 112 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 112 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 224 + 144 * VG +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x19, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: tbz w19, #0, .LBB29_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB29_2: // %entry +; CHECK64-NEXT: mov x0, sp +; CHECK64-NEXT: mov w1, #45 // =0x2d +; CHECK64-NEXT: mov w2, #37 // =0x25 +; CHECK64-NEXT: bl memset +; CHECK64-NEXT: tbz w19, #0, .LBB29_4 +; CHECK64-NEXT: // %bb.3: // %entry +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB29_4: // %entry +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: add sp, sp, #112 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; 
CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #18 +; CHECK64-NEXT: .cfi_def_cfa wsp, 112 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 +; CHECK64-NEXT: ldp x27, x19, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w27 +; CHECK64-NEXT: .cfi_restore w28 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_alloca_call: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1072 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w27, -16 +; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: .cfi_offset w30, -40 +; CHECK1024-NEXT: .cfi_offset w29, -48 +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 
2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 32 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2144 + 144 * VG +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x19, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: tbz w19, #0, .LBB29_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB29_2: // %entry +; CHECK1024-NEXT: mov x0, sp +; CHECK1024-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NEXT: bl memset +; CHECK1024-NEXT: tbz w19, #0, .LBB29_4 +; CHECK1024-NEXT: // %bb.3: // %entry +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB29_4: // %entry +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, 
[sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #18 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1072 +; CHECK1024-NEXT: .cfi_restore z8 +; CHECK1024-NEXT: .cfi_restore z9 +; CHECK1024-NEXT: .cfi_restore z10 +; CHECK1024-NEXT: .cfi_restore z11 +; CHECK1024-NEXT: .cfi_restore z12 +; CHECK1024-NEXT: .cfi_restore z13 +; CHECK1024-NEXT: .cfi_restore z14 +; CHECK1024-NEXT: .cfi_restore z15 +; CHECK1024-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w27 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %0 = alloca [37 x i8], align 16 + %call = call ptr @memset(ptr noundef nonnull %0, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +define void @call_with_doubles() "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: call_with_doubles: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: str x30, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset w30, -8 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov x8, #9221120237041090560 // =0x7ff8000000000000 +; CHECK0-NEXT: fmov d8, x8 +; CHECK0-NEXT: fmov d0, d8 +; CHECK0-NEXT: bl calld +; CHECK0-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +; CHECK0-NEXT: fmov d0, d8 +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: b calld +; +; CHECK64-LABEL: call_with_doubles: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x30, [sp, #136] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset w30, -8 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: mov x8, #9221120237041090560 // =0x7ff8000000000000 +; CHECK64-NEXT: fmov d8, x8 +; CHECK64-NEXT: fmov d0, d8 +; CHECK64-NEXT: bl calld +; CHECK64-NEXT: fmov d0, d8 +; CHECK64-NEXT: ldr x30, [sp, #136] // 8-byte Folded Reload +; CHECK64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: b calld +; +; CHECK1024-LABEL: call_with_doubles: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w30, -16 +; CHECK1024-NEXT: .cfi_offset w29, -24 +; CHECK1024-NEXT: .cfi_offset b8, -1056 +; CHECK1024-NEXT: mov x8, #9221120237041090560 // =0x7ff8000000000000 +; CHECK1024-NEXT: fmov d8, x8 +; CHECK1024-NEXT: fmov d0, d8 +; CHECK1024-NEXT: bl calld +; CHECK1024-NEXT: fmov d0, d8 +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: b calld +entry: + %call = tail call i32 @calld(double 0x7FF8000000000000) + %call.1 = tail call i32 @calld(double 0x7FF8000000000000) + ret void +} +declare i32 @calld(double) "aarch64_pstate_sm_compatible" + +; Check that stack objects are ordererd fpr > hazard > gpr +define void @ordering_test(double %d, half %h, <4 x i32> %v) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: ordering_test: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #48 +; CHECK0-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-NEXT: str wzr, [sp, #32] +; CHECK0-NEXT: str d0, [sp, #24] +; CHECK0-NEXT: str wzr, [sp, #44] +; CHECK0-NEXT: str h1, [sp, #22] +; CHECK0-NEXT: str wzr, [sp, #16] +; CHECK0-NEXT: str q2, [sp], #48 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: ordering_test: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp wzr, wzr, [sp, #12] +; CHECK64-NEXT: str d0, [sp, #120] +; CHECK64-NEXT: str wzr, [sp, #28] +; CHECK64-NEXT: str h1, [sp, #118] +; CHECK64-NEXT: str q2, [sp, #96] +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: ordering_test: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2128 +; CHECK1024-NEXT: 
.cfi_offset w29, -16 +; CHECK1024-NEXT: stp wzr, wzr, [sp, #12] +; CHECK1024-NEXT: str d0, [sp, #1080] +; CHECK1024-NEXT: str wzr, [sp, #28] +; CHECK1024-NEXT: str h1, [sp, #1078] +; CHECK1024-NEXT: str q2, [sp, #1056] +; CHECK1024-NEXT: add sp, sp, #1088 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %i32 = alloca i32 + %i64 = alloca i64 + %f64 = alloca double + %f16 = alloca half + %i32b = alloca i32 + %v4i32 = alloca <4 x i32> + store i32 0, ptr %i64 + store double %d, ptr %f64 + store i32 0, ptr %i32 + store half %h, ptr %f16 + store i32 0, ptr %i32b + store <4 x i32> %v, ptr %v4i32 + ret void +} + + +define void @ordering_test_array(i64 %o, i64 %p, float %f, i32 %x) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: ordering_test_array: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #272 +; CHECK0-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 272 +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: add x8, sp, #128 +; CHECK0-NEXT: str w2, [x8, x0, lsl #2] +; CHECK0-NEXT: mov x8, sp +; CHECK0-NEXT: str s0, [x8, x1, lsl #2] +; CHECK0-NEXT: add sp, sp, #272 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: ordering_test_array: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #400 +; CHECK64-NEXT: str x29, [sp, #384] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 400 +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: mov x8, sp +; CHECK64-NEXT: str w2, [x8, x0, lsl #2] +; CHECK64-NEXT: add x8, sp, #192 +; CHECK64-NEXT: str s0, [x8, x1, lsl #2] +; CHECK64-NEXT: add sp, sp, #400 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: ordering_test_array: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1280 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2320 +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: mov x8, sp +; CHECK1024-NEXT: str w2, [x8, x0, lsl #2] +; CHECK1024-NEXT: add x8, sp, #1152 +; CHECK1024-NEXT: str s0, [x8, x1, lsl #2] +; CHECK1024-NEXT: add sp, sp, #1280 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %i32 = alloca [32 x i32] + %f32 = alloca [32 x float] + %g = getelementptr i32, ptr %i32, i64 %o + store i32 %x, ptr %g + %h = getelementptr float, ptr %f32, i64 %p + store float %f, ptr %h + ret void +} + +; The VA register currently ends up in VLA space. Lets hope that doesn't come up very often. +define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" { +; CHECK0-LABEL: vastate: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 112 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK0-NEXT: add x29, sp, #64 +; CHECK0-NEXT: .cfi_def_cfa w29, 48 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w30, -40 +; CHECK0-NEXT: .cfi_offset w29, -48 +; CHECK0-NEXT: .cfi_offset b8, -56 +; CHECK0-NEXT: .cfi_offset b9, -64 +; CHECK0-NEXT: .cfi_offset b10, -72 +; CHECK0-NEXT: .cfi_offset b11, -80 +; CHECK0-NEXT: .cfi_offset b12, -88 +; CHECK0-NEXT: .cfi_offset b13, -96 +; CHECK0-NEXT: .cfi_offset b14, -104 +; CHECK0-NEXT: .cfi_offset b15, -112 +; CHECK0-NEXT: sub sp, sp, #16 +; CHECK0-NEXT: rdsvl x8, #1 +; CHECK0-NEXT: mov x9, sp +; CHECK0-NEXT: mov w20, w0 +; CHECK0-NEXT: msub x9, x8, x8, x9 +; CHECK0-NEXT: mov sp, x9 +; CHECK0-NEXT: stur x9, [x29, #-80] +; CHECK0-NEXT: sub x9, x29, #80 +; CHECK0-NEXT: sturh wzr, [x29, #-70] +; CHECK0-NEXT: stur wzr, [x29, #-68] +; CHECK0-NEXT: sturh w8, [x29, #-72] +; CHECK0-NEXT: msr TPIDR2_EL0, x9 +; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: bl other +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: smstart za +; CHECK0-NEXT: mrs x8, TPIDR2_EL0 +; CHECK0-NEXT: sub x0, x29, #80 +; CHECK0-NEXT: cbnz x8, .LBB33_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: bl __arm_tpidr2_restore +; CHECK0-NEXT: .LBB33_2: // %entry +; CHECK0-NEXT: mov w0, w20 +; CHECK0-NEXT: msr TPIDR2_EL0, xzr +; CHECK0-NEXT: sub sp, x29, #64 +; CHECK0-NEXT: .cfi_def_cfa wsp, 112 +; CHECK0-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: .cfi_restore b8 +; CHECK0-NEXT: .cfi_restore b9 +; CHECK0-NEXT: .cfi_restore b10 +; CHECK0-NEXT: .cfi_restore b11 +; CHECK0-NEXT: .cfi_restore b12 +; CHECK0-NEXT: .cfi_restore b13 +; CHECK0-NEXT: .cfi_restore b14 +; CHECK0-NEXT: .cfi_restore b15 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: vastate: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: stp d15, d14, [sp, #-176]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 176 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x20, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #160] // 8-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 176 +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w20, -24 +; CHECK64-NEXT: .cfi_offset w30, -40 +; CHECK64-NEXT: .cfi_offset w29, -48 +; CHECK64-NEXT: .cfi_offset b8, -120 +; CHECK64-NEXT: .cfi_offset b9, -128 +; CHECK64-NEXT: .cfi_offset b10, -136 +; CHECK64-NEXT: .cfi_offset b11, -144 +; CHECK64-NEXT: .cfi_offset b12, -152 +; CHECK64-NEXT: .cfi_offset b13, -160 +; CHECK64-NEXT: .cfi_offset b14, -168 +; CHECK64-NEXT: .cfi_offset b15, -176 +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: rdsvl x8, #1 +; CHECK64-NEXT: mov x9, sp +; CHECK64-NEXT: mov w20, w0 +; CHECK64-NEXT: msub x9, x8, x8, x9 +; CHECK64-NEXT: mov sp, x9 +; CHECK64-NEXT: stur x9, [x29, #-80] +; CHECK64-NEXT: sub x9, x29, #80 +; CHECK64-NEXT: sturh wzr, [x29, #-70] +; CHECK64-NEXT: stur wzr, [x29, #-68] +; CHECK64-NEXT: sturh w8, [x29, #-72] +; CHECK64-NEXT: msr TPIDR2_EL0, x9 +; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: bl other +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: smstart za +; CHECK64-NEXT: mrs x8, TPIDR2_EL0 +; CHECK64-NEXT: sub x0, x29, #80 +; CHECK64-NEXT: cbnz x8, .LBB33_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: bl __arm_tpidr2_restore +; CHECK64-NEXT: .LBB33_2: // %entry +; CHECK64-NEXT: mov w0, w20 +; CHECK64-NEXT: msr TPIDR2_EL0, xzr +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 176 +; CHECK64-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload +; CHECK64-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK64-NEXT: ldr d15, [sp], #176 // 8-byte Folded Reload +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: .cfi_restore b8 +; CHECK64-NEXT: .cfi_restore b9 +; CHECK64-NEXT: .cfi_restore b10 +; CHECK64-NEXT: .cfi_restore b11 +; CHECK64-NEXT: .cfi_restore b12 +; CHECK64-NEXT: .cfi_restore b13 +; CHECK64-NEXT: .cfi_restore b14 +; CHECK64-NEXT: .cfi_restore b15 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: vastate: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1136 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1136 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; 
CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: mov x29, sp +; CHECK1024-NEXT: .cfi_def_cfa w29, 1136 +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: .cfi_offset w30, -40 +; CHECK1024-NEXT: .cfi_offset w29, -48 +; CHECK1024-NEXT: .cfi_offset b8, -1080 +; CHECK1024-NEXT: .cfi_offset b9, -1088 +; CHECK1024-NEXT: .cfi_offset b10, -1096 +; CHECK1024-NEXT: .cfi_offset b11, -1104 +; CHECK1024-NEXT: .cfi_offset b12, -1112 +; CHECK1024-NEXT: .cfi_offset b13, -1120 +; CHECK1024-NEXT: .cfi_offset b14, -1128 +; CHECK1024-NEXT: .cfi_offset b15, -1136 +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: rdsvl x8, #1 +; CHECK1024-NEXT: mov x9, sp +; CHECK1024-NEXT: mov w20, w0 +; CHECK1024-NEXT: msub x9, x8, x8, x9 +; CHECK1024-NEXT: mov sp, x9 +; CHECK1024-NEXT: sub x10, x29, #784 +; CHECK1024-NEXT: stur x9, [x10, #-256] +; CHECK1024-NEXT: sub x9, x29, #774 +; CHECK1024-NEXT: sub x10, x29, #772 +; CHECK1024-NEXT: sturh wzr, [x9, #-256] +; CHECK1024-NEXT: sub x9, x29, #1040 +; CHECK1024-NEXT: stur wzr, [x10, #-256] +; CHECK1024-NEXT: sub x10, x29, #776 +; CHECK1024-NEXT: sturh w8, [x10, #-256] +; CHECK1024-NEXT: msr TPIDR2_EL0, x9 +; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: bl other +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: smstart za +; CHECK1024-NEXT: mrs x8, TPIDR2_EL0 +; CHECK1024-NEXT: sub x0, x29, #1040 +; CHECK1024-NEXT: cbnz x8, .LBB33_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: bl __arm_tpidr2_restore +; CHECK1024-NEXT: .LBB33_2: // %entry +; CHECK1024-NEXT: mov w0, w20 +; CHECK1024-NEXT: msr TPIDR2_EL0, xzr +; CHECK1024-NEXT: mov sp, x29 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1136 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w20 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: .cfi_restore b8 +; CHECK1024-NEXT: .cfi_restore b9 +; CHECK1024-NEXT: .cfi_restore b10 +; CHECK1024-NEXT: .cfi_restore b11 +; CHECK1024-NEXT: .cfi_restore b12 +; CHECK1024-NEXT: .cfi_restore b13 +; CHECK1024-NEXT: .cfi_restore b14 +; CHECK1024-NEXT: .cfi_restore b15 +; CHECK1024-NEXT: ret +entry: + tail call void @other() + ret i32 %x +} +declare void @other() From 8afb395aeef5335554c623f92f48cbdc9ffe927d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 18 Jul 2024 09:28:32 +0200 Subject: [PATCH 380/777] [sanitizer] Fix running sanitizer_bad_report_path_test on Linux as root (#97732) Running tests as root is not the greatest idea, however, there is one valid use case - running them in a container in order to verify LLVM on different distros. 
There is no reason to configure unprivileged users in a container, so
one works as root.

sanitizer_bad_report_path_test assumes that creating a file in a
non-writable directory would fail, which is not always the case. For
example, it does not fail on Linux when CAP_DAC_OVERRIDE, which root
has, is in effect. Therefore, one solution is to drop that capability.
However, that would be Linux-specific. Instead, use argv[0] as if it
were a directory. mkdir() on top of a file should be prohibited by all
supported POSIX operating systems.

Combine this with a partial revert of commit f4214e1469ad ("[sanitizer]
Skip test on Android where chmod is not working"), since we shouldn't
need to exclude Android anymore.
---
 .../Posix/sanitizer_bad_report_path_test.cpp | 27 -------------------
 .../Posix/sanitizer_set_report_path_test.cpp | 10 ++++++-
 2 files changed, 9 insertions(+), 28 deletions(-)
 delete mode 100644 compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_bad_report_path_test.cpp

diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_bad_report_path_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_bad_report_path_test.cpp
deleted file mode 100644
index fd4abf448b09d..0000000000000
--- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_bad_report_path_test.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// Test __sanitizer_set_report_path and __sanitizer_get_report_path with an
-// unwritable directory.
-// RUN: rm -rf %t.report_path && mkdir -p %t.report_path
-// RUN: chmod u-w %t.report_path || true
-// RUN: %clangxx -O2 %s -o %t
-// RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=FAIL
-
-// The chmod is not working on the android bot for some reason.
-// UNSUPPORTED: android
-
-#include <assert.h>
-#include <sanitizer/common_interface_defs.h>
-#include <stdio.h>
-#include <string.h>
-
-volatile int *null = 0;
-
-int main(int argc, char **argv) {
-  char buff[1000];
-  sprintf(buff, "%s.report_path/report", argv[0]);
-  __sanitizer_set_report_path(buff);
-  assert(strncmp(buff, __sanitizer_get_report_path(), strlen(buff)) == 0);
-  printf("Path %s\n", __sanitizer_get_report_path());
-}
-
-// FAIL: ERROR: Can't open file: {{.*}}Posix/Output/sanitizer_bad_report_path_test.cpp.tmp.report_path/report.
-// FAIL-NOT: Path {{.*}}Posix/Output/sanitizer_bad_report_path_test.cpp.tmp.report_path/report.
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp
index 17cee722749d6..286eafc315baf 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp
@@ -1,6 +1,6 @@
 // Test __sanitizer_set_report_path and __sanitizer_get_report_path:
 // RUN: %clangxx -O2 %s -o %t
-// RUN: %run %t | FileCheck %s
+// RUN: not %run %t 2>&1 | FileCheck %s
 
 #include <assert.h>
 #include <sanitizer/common_interface_defs.h>
@@ -15,6 +15,14 @@ int main(int argc, char **argv) {
   __sanitizer_set_report_path(buff);
   assert(strncmp(buff, __sanitizer_get_report_path(), strlen(buff)) == 0);
   printf("Path %s\n", __sanitizer_get_report_path());
+  fflush(stdout);
+
+  // Try setting again with an invalid/inaccessible directory.
+  sprintf(buff, "%s/report", argv[0]);
+  __sanitizer_set_report_path(buff);
+  printf("Path %s\n", __sanitizer_get_report_path());
 }
 
 // CHECK: Path {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp.report_path/report.
+// CHECK: ERROR: Can't create directory: {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp +// CHECK-NOT: Path {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp From a19e5aedd9b15ecf0b05bafb7d20e13c952b4531 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 18 Jul 2024 09:36:13 +0200 Subject: [PATCH 381/777] [flang] load SECOND result in genSecond (#99342) Until genSecond, all intrinsic `genXXX` returning scalar intrinsic (except NULL) were returning them as value. The code calling genIntrinsicCall is using that assumption when generation the asExprOp because hflir.expr<> of scalar are badly supported in tools (I should likely just forbid them all together), the type is meant for "non trivial" values: arrays, character, and derived type. For instance, the added tests crashed with error: `'arith.subf' op operand #0 must be floating-point-like, but got '!hlfir.expr'` Load the result in genSecond and add an assert after genIntrinsicCall to better enforce this. --- flang/lib/Lower/ConvertCall.cpp | 2 ++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 2 +- flang/test/Lower/Intrinsics/second.f90 | 28 +++++++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 54e29a1d60689..ba65b644e5a93 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -2005,6 +2005,8 @@ genIntrinsicRefCore(Fortran::lower::PreparedActualArguments &loweredActuals, // returns a null pointer variable that should not be transformed into a value // (what matters is the memory address). if (resultEntity.isVariable() && intrinsicName != "null") { + assert(!fir::isa_trivial(fir::unwrapRefType(resultEntity.getType())) && + "expect intrinsic scalar results to not be in memory"); hlfir::AsExprOp asExpr; // Character/Derived MERGE lowering returns one of its argument address // (this is the only intrinsic implemented in that way so far). The diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ba71fb3b4040c..0e5e30a7024d8 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6161,7 +6161,7 @@ IntrinsicLibrary::genSecond(std::optional resultType, genCpuTime(subroutineArgs); if (resultType) - return result; + return builder.create(loc, fir::getBase(result)); return {}; } diff --git a/flang/test/Lower/Intrinsics/second.f90 b/flang/test/Lower/Intrinsics/second.f90 index f1e66506aaaca..7c5cc5e09bbe6 100644 --- a/flang/test/Lower/Intrinsics/second.f90 +++ b/flang/test/Lower/Intrinsics/second.f90 @@ -28,10 +28,28 @@ subroutine test_function(time) ! CHECK: %[[VAL_4:.*]] = fir.call @_FortranACpuTime() fastmath : () -> f64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (f64) -> f32 ! CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_7:.*]] = arith.constant false -! CHECK: %[[VAL_8:.*]] = hlfir.as_expr %[[VAL_6]]#0 move %[[VAL_7]] : (!fir.ref, i1) -> !hlfir.expr -! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : !hlfir.expr, !fir.ref -! CHECK: hlfir.destroy %[[VAL_8]] : !hlfir.expr +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref +! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_3]]#0 : f32, !fir.ref +! CHECK: return +! CHECK: } + +subroutine test_function_subexpr(t1, t2) + real :: t1, t2 + t2 = second() - t1 +end subroutine +! 
CHECK-LABEL: func.func @_QPtest_function_subexpr( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "t1"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "t2"}) { +! CHECK: %[[VAL_2:.*]] = fir.alloca f32 +! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFtest_function_subexprEt1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFtest_function_subexprEt2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_6:.*]] = fir.call @_FortranACpuTime() fastmath : () -> f64 +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (f64) -> f32 +! CHECK: fir.store %[[VAL_7]] to %[[VAL_2]] : !fir.ref +! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref +! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref +! CHECK: %[[VAL_10:.*]] = arith.subf %[[VAL_8]], %[[VAL_9]] fastmath : f32 +! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 : f32, !fir.ref ! CHECK: return ! CHECK: } From e93df78bd46b585c0bdabdbdc95410e4c08b9d38 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 18 Jul 2024 09:44:06 +0200 Subject: [PATCH 382/777] [llvm/DWARF] Recursively resolve DW_AT_signature references (#97423) findRecursively follows DW_AT_specification and DW_AT_abstract_origin references, but not DW_AT_signature. As far as I can tell, there is no fundamental difference between these attributes that would make this behavior desirable, and this just seems like a consequence of the fact that this attribute is newer. This patch aims to change that. The motivation is some code in lldb, which assumes that it can construct a qualified name of a type by just walking the parent chain and looking at the name attribute. This works for "regular" debug info, even when some of the DIEs are just forward declarations, but it breaks in the presence of type units, because of the need to explicitly resolve the signature reference. While LLDB does not use the llvm's DWARFDie class (yet?), this seems like a very important change in the overall API, and any divergence here would complicate eventual reunification, which is why I am making the change in the llvm API first. However, putting lldb aside, I think this change is beneficial in llvm on its own, as it allows us to remove the explicit DW_AT_signature resolution in the DWARFTypePrinter. --- llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h | 2 - llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 36 +++++-------- llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp | 52 +++++++++---------- .../X86/prettyprint_type_units.s | 19 ++++++- 4 files changed, 54 insertions(+), 55 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h index 421b84d644db6..497d3bee048ab 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -181,8 +181,6 @@ class DWARFDie { DWARFDie getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const; DWARFDie getAttributeValueAsReferencedDie(const DWARFFormValue &V) const; - DWARFDie resolveTypeUnitReference() const; - /// Extract the range base attribute from this DIE as absolute section offset. 
/// /// This is a utility function that checks for either the DW_AT_rnglists_base diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 72e7464b68971..345a91a6f3585 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -103,10 +103,6 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, .print(OS, DumpOpts, U); } -static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { - return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); -} - static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -198,8 +194,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { - DWARFDie D = resolveReferencedType(Die, FormValue); - if (D && !D.isNULL()) { + if (DWARFDie D = Die.getAttributeValueAsReferencedDie(FormValue); + D && !D.isNULL()) { OS << Space << "\""; dumpTypeQualifiedName(D, OS); OS << '"'; @@ -291,13 +287,12 @@ DWARFDie::findRecursively(ArrayRef Attrs) const { if (auto Value = Die.find(Attrs)) return Value; - if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) - if (Seen.insert(D).second) - Worklist.push_back(D); - - if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification)) - if (Seen.insert(D).second) - Worklist.push_back(D); + for (dwarf::Attribute Attr : + {DW_AT_abstract_origin, DW_AT_specification, DW_AT_signature}) { + if (auto D = Die.getAttributeValueAsReferencedDie(Attr)) + if (Seen.insert(D).second) + Worklist.push_back(D); + } } return std::nullopt; @@ -319,21 +314,14 @@ DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const { } else if (Offset = V.getAsDebugInfoReference(); Offset) { if (DWARFUnit *SpecUnit = U->getUnitVector().getUnitForOffset(*Offset)) Result = SpecUnit->getDIEForOffset(*Offset); + } else if (std::optional Sig = V.getAsSignatureReference()) { + if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash( + U->getVersion(), *Sig, U->isDWOUnit())) + Result = TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset()); } return Result; } -DWARFDie DWARFDie::resolveTypeUnitReference() const { - if (auto Attr = find(DW_AT_signature)) { - if (std::optional Sig = Attr->getAsReferenceUVal()) { - if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash( - U->getVersion(), *Sig, U->isDWOUnit())) - return TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset()); - } - } - return *this; -} - std::optional DWARFDie::getRangesBaseAttribute() const { return toSectionOffset(find({DW_AT_rnglists_base, DW_AT_GNU_ranges_base})); } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp index a26431e8313f6..fc1aae77a9293 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp @@ -62,17 +62,10 @@ void DWARFTypePrinter::appendArrayType(const DWARFDie &D) { EndedWithTemplate = false; } -static DWARFDie resolveReferencedType(DWARFDie D, - dwarf::Attribute Attr = DW_AT_type) { - return D.getAttributeValueAsReferencedDie(Attr).resolveTypeUnitReference(); -} -static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { - return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); -} DWARFDie DWARFTypePrinter::skipQualifiers(DWARFDie D) { while (D 
&& (D.getTag() == DW_TAG_const_type || D.getTag() == DW_TAG_volatile_type)) - D = resolveReferencedType(D); + D = D.getAttributeValueAsReferencedDie(DW_AT_type); return D; } @@ -103,7 +96,9 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, return DWARFDie(); } DWARFDie InnerDIE; - auto Inner = [&] { return InnerDIE = resolveReferencedType(D); }; + auto Inner = [&] { + return InnerDIE = D.getAttributeValueAsReferencedDie(DW_AT_type); + }; const dwarf::Tag T = D.getTag(); switch (T) { case DW_TAG_pointer_type: { @@ -134,7 +129,8 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, OS << '('; else if (Word) OS << ' '; - if (DWARFDie Cont = resolveReferencedType(D, DW_AT_containing_type)) { + if (DWARFDie Cont = + D.getAttributeValueAsReferencedDie(DW_AT_containing_type)) { appendQualifiedName(Cont); EndedWithTemplate = false; OS << "::"; @@ -173,7 +169,8 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, case DW_TAG_base_type: */ default: { - const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr); + const char *NamePtr = + dwarf::toString(D.findRecursively(DW_AT_name), nullptr); if (!NamePtr) { appendTypeTagName(D.getTag()); return DWARFDie(); @@ -235,9 +232,9 @@ void DWARFTypePrinter::appendUnqualifiedNameAfter( case DW_TAG_pointer_type: { if (needsParens(Inner)) OS << ')'; - appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner), - /*SkipFirstParamIfArtificial=*/D.getTag() == - DW_TAG_ptr_to_member_type); + appendUnqualifiedNameAfter( + Inner, Inner.getAttributeValueAsReferencedDie(DW_AT_type), + /*SkipFirstParamIfArtificial=*/D.getTag() == DW_TAG_ptr_to_member_type); break; } case DW_TAG_LLVM_ptrauth_type: { @@ -341,7 +338,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, appendTemplateParameters(C, FirstParameter); } if (C.getTag() == dwarf::DW_TAG_template_value_parameter) { - DWARFDie T = resolveReferencedType(C); + DWARFDie T = C.getAttributeValueAsReferencedDie(DW_AT_type); Sep(); if (T.getTag() == DW_TAG_enumeration_type) { OS << '('; @@ -461,7 +458,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, continue; auto TypeAttr = C.find(DW_AT_type); Sep(); - appendQualifiedName(TypeAttr ? resolveReferencedType(C, *TypeAttr) + appendQualifiedName(TypeAttr ? C.getAttributeValueAsReferencedDie(*TypeAttr) : DWARFDie()); } if (IsTemplate && *FirstParameter && FirstParameter == &FirstParameterValue) { @@ -473,15 +470,15 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, void DWARFTypePrinter::decomposeConstVolatile(DWARFDie &N, DWARFDie &T, DWARFDie &C, DWARFDie &V) { (N.getTag() == DW_TAG_const_type ? 
C : V) = N; - T = resolveReferencedType(N); + T = N.getAttributeValueAsReferencedDie(DW_AT_type); if (T) { auto Tag = T.getTag(); if (Tag == DW_TAG_const_type) { C = T; - T = resolveReferencedType(T); + T = T.getAttributeValueAsReferencedDie(DW_AT_type); } else if (Tag == DW_TAG_volatile_type) { V = T; - T = resolveReferencedType(T); + T = T.getAttributeValueAsReferencedDie(DW_AT_type); } } } @@ -491,10 +488,11 @@ void DWARFTypePrinter::appendConstVolatileQualifierAfter(DWARFDie N) { DWARFDie T; decomposeConstVolatile(N, T, C, V); if (T && T.getTag() == DW_TAG_subroutine_type) - appendSubroutineNameAfter(T, resolveReferencedType(T), false, C.isValid(), - V.isValid()); + appendSubroutineNameAfter(T, T.getAttributeValueAsReferencedDie(DW_AT_type), + false, C.isValid(), V.isValid()); else - appendUnqualifiedNameAfter(T, resolveReferencedType(T)); + appendUnqualifiedNameAfter(T, + T.getAttributeValueAsReferencedDie(DW_AT_type)); } void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) { DWARFDie C; @@ -504,7 +502,7 @@ void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) { bool Subroutine = T && T.getTag() == DW_TAG_subroutine_type; DWARFDie A = T; while (A && A.getTag() == DW_TAG_array_type) - A = resolveReferencedType(A); + A = A.getAttributeValueAsReferencedDie(DW_AT_type); bool Leading = (!A || (A.getTag() != DW_TAG_pointer_type && A.getTag() != llvm::dwarf::DW_TAG_ptr_to_member_type)) && @@ -546,7 +544,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter( if (P.getTag() != DW_TAG_formal_parameter && P.getTag() != DW_TAG_unspecified_parameters) return; - DWARFDie T = resolveReferencedType(P); + DWARFDie T = P.getAttributeValueAsReferencedDie(DW_AT_type); if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) { FirstParamIfArtificial = T; RealFirst = false; @@ -567,7 +565,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter( if (DWARFDie P = FirstParamIfArtificial) { if (P.getTag() == DW_TAG_pointer_type) { auto CVStep = [&](DWARFDie CV) { - if (DWARFDie U = resolveReferencedType(CV)) { + if (DWARFDie U = CV.getAttributeValueAsReferencedDie(DW_AT_type)) { Const |= U.getTag() == DW_TAG_const_type; Volatile |= U.getTag() == DW_TAG_volatile_type; return U; @@ -653,7 +651,8 @@ void DWARFTypePrinter::appendSubroutineNameAfter( if (D.find(DW_AT_rvalue_reference)) OS << " &&"; - appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner)); + appendUnqualifiedNameAfter( + Inner, Inner.getAttributeValueAsReferencedDie(DW_AT_type)); } void DWARFTypePrinter::appendScopes(DWARFDie D) { if (D.getTag() == DW_TAG_compile_unit) @@ -666,7 +665,6 @@ void DWARFTypePrinter::appendScopes(DWARFDie D) { return; if (D.getTag() == DW_TAG_lexical_block) return; - D = D.resolveTypeUnitReference(); if (DWARFDie P = D.getParent()) appendScopes(P); appendUnqualifiedName(D); diff --git a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s index aad748a301e6b..5611963a585f6 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s @@ -18,12 +18,15 @@ # doesn't really need templates - two local variables would've sufficed # (anything that references the type units) but I was working on something else # and this seemed minimal enough. +# A gcc-style type signature reference was also inserted. 
# CHECK: DW_TAG_template_type_parameter # CHECK: DW_AT_type ({{.*}} "t1") # CHECK: DW_TAG_template_type_parameter # CHECK: DW_AT_type ({{.*}} "t2") +# CHECK: DW_TAG_template_type_parameter +# CHECK: DW_AT_type (0xc6694e51369161f2 "t1") .text .file "test.cpp" @@ -270,6 +273,13 @@ _Z2f1IJ2t12t2EEvv: # @_Z2f1IJ2t12t2EEvv .byte 11 # DW_FORM_data1 .byte 0 # EOM(1) .byte 0 # EOM(2) + .byte 12 # Abbreviation Code + .byte 47 # DW_TAG_template_type_parameter + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) .byte 0 # EOM(3) .section .debug_info,"",@progbits .Lcu_begin0: @@ -313,18 +323,23 @@ _Z2f1IJ2t12t2EEvv: # @_Z2f1IJ2t12t2EEvv .byte 6 # Abbrev [6] 0x46:0xd DW_TAG_GNU_template_parameter_pack .byte 5 # DW_AT_name .byte 7 # Abbrev [7] 0x48:0x5 DW_TAG_template_type_parameter - .long 88 # DW_AT_type + .long .Lt1_decl-.Lcu_begin0 # DW_AT_type .byte 7 # Abbrev [7] 0x4d:0x5 DW_TAG_template_type_parameter - .long 97 # DW_AT_type + # Simulate DWARF emitted by GCC where the signature is directly in the type attribute. + .long .Lt2_decl-.Lcu_begin0 # DW_AT_type + .byte 12 # Abbrev [12] DW_TAG_template_type_parameter + .quad -4149699470930386446 # DW_AT_type .byte 0 # End Of Children Mark .byte 0 # End Of Children Mark .byte 8 # Abbrev [8] 0x54:0x4 DW_TAG_base_type .byte 4 # DW_AT_name .byte 5 # DW_AT_encoding .byte 4 # DW_AT_byte_size + .Lt1_decl: .byte 9 # Abbrev [9] 0x58:0x9 DW_TAG_structure_type # DW_AT_declaration .quad -4149699470930386446 # DW_AT_signature + .Lt2_decl: .byte 9 # Abbrev [9] 0x61:0x9 DW_TAG_structure_type # DW_AT_declaration .quad 5649318945901130368 # DW_AT_signature From 09cbb45edd149d30766c87be4628e4df13f3496d Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 18 Jul 2024 09:46:29 +0200 Subject: [PATCH 383/777] [BOLT][DWARF][NFC] A better DIEBuilder for the llvm API change in #98905 (#99324) The caller (cloneAttribute) already switches on the reference type. By aligning the cases with the retrieval functions, we can avoid branching twice. --- bolt/include/bolt/Core/DIEBuilder.h | 12 ++------ bolt/lib/Core/DIEBuilder.cpp | 43 +++++++---------------------- 2 files changed, 12 insertions(+), 43 deletions(-) diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h index c562373c718ba..0b840c142ed81 100644 --- a/bolt/include/bolt/Core/DIEBuilder.h +++ b/bolt/include/bolt/Core/DIEBuilder.h @@ -135,13 +135,6 @@ class DIEBuilder { /// Returns current state of the DIEBuilder State &getState() { return *BuilderState.get(); } - /// Resolve the reference in DIE, if target is not loaded into IR, - /// pre-allocate it. \p RefCU will be updated to the Unit specific by \p - /// RefValue. - DWARFDie resolveDIEReference( - const DWARFFormValue &RefValue, - const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, - DWARFUnit *&RefCU, DWARFDebugInfoEntry &DwarfDebugInfoEntry); /// Resolve the reference in DIE, if target is not loaded into IR, /// pre-allocate it. \p RefCU will be updated to the Unit specific by \p @@ -165,10 +158,9 @@ class DIEBuilder { const DWARFFormValue &Val); /// Clone an attribute in reference format. - void cloneDieReferenceAttribute( + void cloneDieOffsetReferenceAttribute( DIE &Die, const DWARFUnit &U, const DWARFDie &InputDIE, - const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, - const DWARFFormValue &Val); + const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, uint64_t Ref); /// Clone an attribute in block format. 
void cloneBlockAttribute( diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index 7815a305c0518..b0f550fd77318 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -551,25 +551,6 @@ void DIEBuilder::finish() { updateReferences(); } -DWARFDie DIEBuilder::resolveDIEReference( - const DWARFFormValue &RefValue, - const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, - DWARFUnit *&RefCU, DWARFDebugInfoEntry &DwarfDebugInfoEntry) { - assert(RefValue.isFormClass(DWARFFormValue::FC_Reference)); - uint64_t RefOffset; - if (std::optional Off = RefValue.getAsRelativeReference()) { - RefOffset = RefValue.getUnit()->getOffset() + *Off; - } else if (Off = RefValue.getAsDebugInfoReference(); Off) { - RefOffset = *Off; - } else { - BC.errs() - << "BOLT-WARNING: [internal-dwarf-error]: unsupported reference type: " - << FormEncodingString(RefValue.getForm()) << ".\n"; - return DWARFDie(); - } - return resolveDIEReference(AttrSpec, RefOffset, RefCU, DwarfDebugInfoEntry); -} - DWARFDie DIEBuilder::resolveDIEReference( const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, const uint64_t RefOffset, DWARFUnit *&RefCU, @@ -613,23 +594,14 @@ DWARFDie DIEBuilder::resolveDIEReference( return DWARFDie(); } -void DIEBuilder::cloneDieReferenceAttribute( +void DIEBuilder::cloneDieOffsetReferenceAttribute( DIE &Die, const DWARFUnit &U, const DWARFDie &InputDIE, - const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, - const DWARFFormValue &Val) { - uint64_t Ref; - if (std::optional Off = Val.getAsRelativeReference()) - Ref = Val.getUnit()->getOffset() + *Off; - else if (Off = Val.getAsDebugInfoReference(); Off) - Ref = *Off; - else - return; - + const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, uint64_t Ref) { DIE *NewRefDie = nullptr; DWARFUnit *RefUnit = nullptr; DWARFDebugInfoEntry DDIEntry; - const DWARFDie RefDie = resolveDIEReference(Val, AttrSpec, RefUnit, DDIEntry); + const DWARFDie RefDie = resolveDIEReference(AttrSpec, Ref, RefUnit, DDIEntry); if (!RefDie) return; @@ -834,7 +806,7 @@ void DIEBuilder::cloneAddressAttribute( void DIEBuilder::cloneRefsigAttribute( DIE &Die, DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, const DWARFFormValue &Val) { - const std::optional SigVal = Val.getRawUValue(); + const std::optional SigVal = Val.getAsSignatureReference(); Die.addValue(getState().DIEAlloc, AttrSpec.Attr, dwarf::DW_FORM_ref_sig8, DIEInteger(*SigVal)); } @@ -902,11 +874,16 @@ void DIEBuilder::cloneAttribute( cloneStringAttribute(Die, U, AttrSpec, Val); break; case dwarf::DW_FORM_ref_addr: + cloneDieOffsetReferenceAttribute(Die, U, InputDIE, AttrSpec, + *Val.getAsDebugInfoReference()); + break; case dwarf::DW_FORM_ref1: case dwarf::DW_FORM_ref2: case dwarf::DW_FORM_ref4: case dwarf::DW_FORM_ref8: - cloneDieReferenceAttribute(Die, U, InputDIE, AttrSpec, Val); + cloneDieOffsetReferenceAttribute(Die, U, InputDIE, AttrSpec, + Val.getUnit()->getOffset() + + *Val.getAsRelativeReference()); break; case dwarf::DW_FORM_block: case dwarf::DW_FORM_block1: From 2ef7cbf71c98246d6f3a9c63dea75b76c7b5e928 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 18 Jul 2024 11:49:53 +0400 Subject: [PATCH 384/777] [clang] Add deprecation warning for `-Ofast` driver option (#98736) This patch implements consensus on the corresponding RFC documented here: https://discourse.llvm.org/t/rfc-deprecate-ofast/78687/72 Specifically, I added a deprecation warning for `-Ofast`, that suggests to use `-O3` or `-O3` with `-ffast-math`, and a new diagnostic 
group for aforementioned warning. Deprecation period is going to be lengthy, so I hope this PR can be merged in time for Clang 19. --- clang/docs/CommandGuide/clang.rst | 3 ++- clang/docs/ReleaseNotes.rst | 7 +++++++ clang/include/clang/Basic/DiagnosticDriverKinds.td | 4 ++++ clang/include/clang/Basic/DiagnosticGroups.td | 2 ++ clang/include/clang/Driver/Options.td | 4 +++- clang/lib/Driver/ToolChains/Clang.cpp | 2 ++ clang/test/Driver/Ofast.c | 7 ++++++- 7 files changed, 26 insertions(+), 3 deletions(-) diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst index 29154292dc7a5..663aca1f6ddcb 100644 --- a/clang/docs/CommandGuide/clang.rst +++ b/clang/docs/CommandGuide/clang.rst @@ -429,7 +429,8 @@ Code Generation Options :option:`-Ofast` Enables all the optimizations from :option:`-O3` along with other aggressive optimizations that may violate strict compliance with - language standards. + language standards. This is deprecated in favor of :option:`-O3` + in combination with :option:`-ffast-math`. :option:`-Os` Like :option:`-O2` with extra optimizations to reduce code size. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1c1b874273a7c..469510d175887 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -473,6 +473,13 @@ New Compiler Flags Deprecated Compiler Flags ------------------------- +- The ``-Ofast`` command-line option has been deprecated. This option both + enables the ``-O3`` optimization-level, as well as enabling non-standard + ``-ffast-math`` behaviors. As such, it is somewhat misleading as an + "optimization level". Users are advised to switch to ``-O3 -ffast-math`` if + the use of non-standard math behavior is intended, and ``-O3`` otherwise. + See `RFC `_ for details. 
+ Modified Compiler Flags ----------------------- - Added a new diagnostic flag ``-Wreturn-mismatch`` which is grouped under diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 359c0de7f811c..cfa897f28b4c0 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -442,6 +442,10 @@ def warn_drv_deprecated_arg : Warning< def warn_drv_deprecated_arg_no_relaxed_template_template_args : Warning< "argument '-fno-relaxed-template-template-args' is deprecated">, InGroup; +def warn_drv_deprecated_arg_ofast : Warning< + "argument '-Ofast' is deprecated; use '-O3 -ffast math' for the same behavior," + " or '-O3' to enable only conforming optimizations">, + InGroup; def warn_drv_deprecated_custom : Warning< "argument '%0' is deprecated, %1">, InGroup; def warn_drv_assuming_mfloat_abi_is : Warning< diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 2241f8481484e..d7dba76a0fcf8 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -103,6 +103,7 @@ def EnumConversion : DiagGroup<"enum-conversion", EnumFloatConversion, EnumCompareConditional]>; def DeprecatedNoRelaxedTemplateTemplateArgs : DiagGroup<"deprecated-no-relaxed-template-template-args">; +def DeprecatedOFast : DiagGroup<"deprecated-ofast">; def ObjCSignedCharBoolImplicitIntConversion : DiagGroup<"objc-signed-char-bool-implicit-int-conversion">; def Shorten64To32 : DiagGroup<"shorten-64-to-32">; @@ -228,6 +229,7 @@ def Deprecated : DiagGroup<"deprecated", [DeprecatedAnonEnumEnumConversion, DeprecatedPragma, DeprecatedRegister, DeprecatedNoRelaxedTemplateTemplateArgs, + DeprecatedOFast, DeprecatedThisCapture, DeprecatedType, DeprecatedVolatile, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 25555e4620523..1675e435d210c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -931,7 +931,9 @@ def O : Joined<["-"], "O">, Group, def O_flag : Flag<["-"], "O">, Visibility<[ClangOption, CC1Option, FC1Option]>, Alias, AliasArgs<["1"]>; def Ofast : Joined<["-"], "Ofast">, Group, - Visibility<[ClangOption, CC1Option, FlangOption]>; + Visibility<[ClangOption, CC1Option, FlangOption]>, + HelpText<"Deprecated; use '-O3 -ffast math' for the same behavior," + " or '-O3' to enable only conforming optimizations">; def P : Flag<["-"], "P">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Group, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a8a7cef09972e..1fd6fba210042 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5725,6 +5725,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_zero_initialized_in_bss); bool OFastEnabled = isOptimizationLevelFast(Args); + if (OFastEnabled) + D.Diag(diag::warn_drv_deprecated_arg_ofast); // If -Ofast is the optimization level, then -fstrict-aliasing should be // enabled. This alias option is being used to simplify the hasFlag logic. 
OptSpecifier StrictAliasingAliasOption = diff --git a/clang/test/Driver/Ofast.c b/clang/test/Driver/Ofast.c index 8b7f2217eca2f..4c63caf9865d5 100644 --- a/clang/test/Driver/Ofast.c +++ b/clang/test/Driver/Ofast.c @@ -3,19 +3,21 @@ // RUN: %clang -fno-fast-math -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s // RUN: %clang -fno-strict-aliasing -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s // RUN: %clang -fno-vectorize -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s -// RUN: %clang -Ofast -O2 -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \ +// RUN: %clang -Ofast -O2 -### -Werror %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \ // RUN: %if target={{.*-windows-msvc.*}} %{ --check-prefix=CHECK-OFAST-O2-ALIASING-MSVC %} \ // RUN: %else %{ --check-prefix=CHECK-OFAST-O2-ALIASING %} %s // RUN: %clang -Ofast -fno-fast-math -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-FAST-MATH %s // RUN: %clang -Ofast -fno-strict-aliasing -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-STRICT-ALIASING %s // RUN: %clang -Ofast -fno-vectorize -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-VECTORIZE %s +// CHECK-OFAST: use '-O3 -ffast math' for the same behavior, or '-O3' to enable only conforming optimizations // CHECK-OFAST: -cc1 // CHECK-OFAST-NOT: -relaxed-aliasing // CHECK-OFAST: -ffast-math // CHECK-OFAST: -Ofast // CHECK-OFAST: -vectorize-loops +// Lack of warning about '-Ofast' deprecation is checked via -Werror // CHECK-OFAST-O2: -cc1 // CHECK-OFAST-O2-ALIASING-NOT: -relaxed-aliasing // CHECK-OFAST-O2-ALIASING-MSVC: -relaxed-aliasing @@ -23,18 +25,21 @@ // CHECK-OFAST-O2-NOT: -Ofast // CHECK-OFAST-O2: -vectorize-loops +// CHECK-OFAST-NO-FAST-MATH: use '-O3 -ffast math' for the same behavior, or '-O3' to enable only conforming optimizations // CHECK-OFAST-NO-FAST-MATH: -cc1 // CHECK-OFAST-NO-FAST-MATH-NOT: -relaxed-aliasing // CHECK-OFAST-NO-FAST-MATH-NOT: -ffast-math // CHECK-OFAST-NO-FAST-MATH: -Ofast // CHECK-OFAST-NO-FAST-MATH: -vectorize-loops +// CHECK-OFAST-NO-STRICT-ALIASING: use '-O3 -ffast math' for the same behavior, or '-O3' to enable only conforming optimizations // CHECK-OFAST-NO-STRICT-ALIASING: -cc1 // CHECK-OFAST-NO-STRICT-ALIASING: -relaxed-aliasing // CHECK-OFAST-NO-STRICT-ALIASING: -ffast-math // CHECK-OFAST-NO-STRICT-ALIASING: -Ofast // CHECK-OFAST-NO-STRICT-ALIASING: -vectorize-loops +// CHECK-OFAST-NO-VECTORIZE: use '-O3 -ffast math' for the same behavior, or '-O3' to enable only conforming optimizations // CHECK-OFAST-NO-VECTORIZE: -cc1 // CHECK-OFAST-NO-VECTORIZE-NOT: -relaxed-aliasing // CHECK-OFAST-NO-VECTORIZE: -ffast-math From cf66cec7c4481ff39525232d64a4d5215cca3ac5 Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:58:14 +0800 Subject: [PATCH 385/777] Recommit "[PatternMatch] Fix issue of stale reference in new `m_{I,F,}Cmp` matchers" (3rd Try) (#99292) The first fix forgot to fixup the commutative matchers... 
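For illustration, a minimal usage sketch of the two matcher forms this change affects (helper names here are made up, not part of the patch):

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Form 1: capture the predicate when it matters.
static bool isEqCmpOf(Value *V, Value *X) {
  ICmpInst::Predicate Pred;
  return match(V, m_ICmp(Pred, m_Specific(X), m_Value())) &&
         Pred == ICmpInst::ICMP_EQ;
}

// Form 2: predicate-less overload. Before this fix the returned matcher kept
// a reference to a function-local dummy predicate, which dangled once the
// overload returned; the matcher now stores a nullable pointer instead.
static bool isAnyICmp(Value *V) {
  return match(V, m_ICmp(m_Value(), m_Value()));
}
```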
--- llvm/include/llvm/IR/PatternMatch.h | 25 ++++++++++++------------- llvm/unittests/IR/PatternMatch.cpp | 4 ++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 8ae47fb556b25..d9e27e087e705 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1550,23 +1550,27 @@ template inline Exact_match m_Exact(const T &SubPattern) { template struct CmpClass_match { - PredicateTy &Predicate; + PredicateTy *Predicate; LHS_t L; RHS_t R; // The evaluation order is always stable, regardless of Commutability. // The LHS is always matched first. CmpClass_match(PredicateTy &Pred, const LHS_t &LHS, const RHS_t &RHS) - : Predicate(Pred), L(LHS), R(RHS) {} + : Predicate(&Pred), L(LHS), R(RHS) {} + CmpClass_match(const LHS_t &LHS, const RHS_t &RHS) + : Predicate(nullptr), L(LHS), R(RHS) {} template bool match(OpTy *V) { if (auto *I = dyn_cast(V)) { if (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) { - Predicate = I->getPredicate(); + if (Predicate) + *Predicate = I->getPredicate(); return true; } else if (Commutable && L.match(I->getOperand(1)) && R.match(I->getOperand(0))) { - Predicate = I->getSwappedPredicate(); + if (Predicate) + *Predicate = I->getSwappedPredicate(); return true; } } @@ -1595,22 +1599,19 @@ m_FCmp(FCmpInst::Predicate &Pred, const LHS &L, const RHS &R) { template inline CmpClass_match m_Cmp(const LHS &L, const RHS &R) { - CmpInst::Predicate Unused; - return CmpClass_match(Unused, L, R); + return CmpClass_match(L, R); } template inline CmpClass_match m_ICmp(const LHS &L, const RHS &R) { - ICmpInst::Predicate Unused; - return CmpClass_match(Unused, L, R); + return CmpClass_match(L, R); } template inline CmpClass_match m_FCmp(const LHS &L, const RHS &R) { - FCmpInst::Predicate Unused; - return CmpClass_match(Unused, L, R); + return CmpClass_match(L, R); } // Same as CmpClass, but instead of saving Pred as out output variable, match a @@ -2681,9 +2682,7 @@ m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) { template inline CmpClass_match m_c_ICmp(const LHS &L, const RHS &R) { - ICmpInst::Predicate Unused; - return CmpClass_match(Unused, - L, R); + return CmpClass_match(L, R); } /// Matches a specific opcode with LHS and RHS in either order. 
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index b82711ec244a6..309fcc93996bc 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -2235,7 +2235,7 @@ typedef ::testing::Types, MutableConstTestTypes; TYPED_TEST_SUITE(MutableConstTest, MutableConstTestTypes, ); -TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_ICmp) { +TYPED_TEST(MutableConstTest, ICmp) { auto &IRB = PatternMatchTest::IRB; typedef std::tuple_element_t<0, TypeParam> ValueType; @@ -2319,7 +2319,7 @@ TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_ICmp) { .match((InstructionType)IRB.CreateICmp(Pred, L, R))); } -TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_FCmp) { +TYPED_TEST(MutableConstTest, FCmp) { auto &IRB = PatternMatchTest::IRB; typedef std::tuple_element_t<0, TypeParam> ValueType; From 94cd18b7fcf239b85698ad70f145ca5fa5edd516 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 18 Jul 2024 09:37:19 +0200 Subject: [PATCH 386/777] [CVP] Add test for phi merging of vectors (NFC) --- .../CorrelatedValuePropagation/vectors.ll | 54 ++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll index 6f13263fe92be..43e680cd25cdb 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll @@ -241,8 +241,6 @@ define <2 x i16> @and_with_poison(<2 x i8> %a) { ret <2 x i16> %res } - - define <4 x i64> @issue_97674_getConstantOnEdge(i1 %cond) { ; CHECK-LABEL: define <4 x i64> @issue_97674_getConstantOnEdge( ; CHECK-SAME: i1 [[COND:%.*]]) { @@ -277,3 +275,55 @@ entry: %folds = add <4 x i64> zeroinitializer, zeroinitializer ret <4 x i64> %folds } + +define <2 x i16> @phi_merge1(i1 %c, <2 x i8> %a) { +; CHECK-LABEL: define <2 x i16> @phi_merge1( +; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ [[ZEXT]], %[[ENTRY]] ], [ , %[[IF]] ] +; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: ret <2 x i16> [[ADD]] +; +entry: + %zext = zext <2 x i8> %a to <2 x i16> + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi <2 x i16> [ %zext, %entry ], [ , %if ] + %add = add <2 x i16> %phi, + ret <2 x i16> %add +} + +define <2 x i16> @phi_merge2(i1 %c, <2 x i8> %a) { +; CHECK-LABEL: define <2 x i16> @phi_merge2( +; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ , %[[ENTRY]] ], [ [[ZEXT]], %[[IF]] ] +; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: ret <2 x i16> [[ADD]] +; +entry: + %zext = zext <2 x i8> %a to <2 x i16> + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi <2 x i16> [ , %entry ], [ %zext, %if ] + %add = add <2 x i16> %phi, + ret <2 x i16> %add +} From 3eba28d1fd3347a1658f68b63285148b0bb25fab Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 18 Jul 2024 10:02:35 +0200 Subject: [PATCH 
387/777] [clang] Extend lifetime analysis to support assignments for pointer-like objects. (#99032) This is a follow-up patch to #96475 to detect dangling assignments for C++ pointer-like objects (classes annotated with the `[[gsl::Pointer]]`). Fixes #63310. Similar to the behavior for built-in pointer types, if a temporary owner (`[[gsl::Owner]]`) object is assigned to a pointer-like class object, and this temporary object is destroyed at the end of the full assignment expression, the assignee pointer is considered dangling. In such cases, clang will emit a warning: ``` /tmp/t.cpp:7:20: warning: object backing the pointer my_string_view will be destroyed at the end of the full-expression [-Wdangling-assignment-gsl] 7 | my_string_view = CreateString(); | ^~~~~~~~~~~~~~ 1 warning generated. ``` This new warning is `-Wdangling-assignment-gsl`. It is initially disabled, but I intend to enable it by default in clang 20. I have initially tested this patch on our internal codebase, and it has identified many use-after-free bugs, primarily related to `string_view`. --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Basic/DiagnosticGroups.td | 2 + .../clang/Basic/DiagnosticSemaKinds.td | 3 + clang/lib/Sema/CheckExprLifetime.cpp | 79 ++++++++++++------- clang/lib/Sema/SemaOverload.cpp | 9 ++- .../warn-lifetime-analysis-nocfg-disabled.cpp | 4 + .../Sema/warn-lifetime-analysis-nocfg.cpp | 19 +++-- clang/test/SemaCXX/warn-dangling-local.cpp | 4 +- 8 files changed, 85 insertions(+), 38 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 469510d175887..e0e86af257a19 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -727,6 +727,9 @@ Improvements to Clang's diagnostics - Clang now diagnoses integer constant expressions that are folded to a constant value as an extension in more circumstances. Fixes #GH59863 +- Clang now diagnoses dangling assignments for pointer-like objects (annotated with `[[gsl::Pointer]]`) under `-Wdangling-assignment-gsl` (off by default) + Fixes #GH63310. 
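For illustration, a minimal sketch of the pattern the new warning flags, assuming `-Wdangling-assignment-gsl` is enabled (it is off by default) and that `CreateString` is a helper returning a temporary `std::string`:

```cpp
#include <string>
#include <string_view>

std::string CreateString();  // assumed helper; returns a temporary owner

void Use() {
  std::string_view my_string_view;
  // warning: object backing the pointer my_string_view will be destroyed at
  // the end of the full-expression [-Wdangling-assignment-gsl]
  my_string_view = CreateString();
}
```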
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index d7dba76a0fcf8..19c3f1e043349 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -451,6 +451,7 @@ def LogicalNotParentheses: DiagGroup<"logical-not-parentheses">; def ShiftOpParentheses: DiagGroup<"shift-op-parentheses">; def OverloadedShiftOpParentheses: DiagGroup<"overloaded-shift-op-parentheses">; def DanglingAssignment: DiagGroup<"dangling-assignment">; +def DanglingAssignmentGsl : DiagGroup<"dangling-assignment-gsl">; def DanglingElse: DiagGroup<"dangling-else">; def DanglingField : DiagGroup<"dangling-field">; def DanglingInitializerList : DiagGroup<"dangling-initializer-list">; @@ -459,6 +460,7 @@ def ReturnStackAddress : DiagGroup<"return-stack-address">; // Name of this warning in GCC def : DiagGroup<"return-local-addr", [ReturnStackAddress]>; def Dangling : DiagGroup<"dangling", [DanglingAssignment, + DanglingAssignmentGsl, DanglingField, DanglingInitializerList, DanglingGsl, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index b8a43b0a9fe8e..d60f32674ca3a 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10124,6 +10124,9 @@ def warn_dangling_lifetime_pointer : Warning< "object backing the pointer " "will be destroyed at the end of the full-expression">, InGroup; +def warn_dangling_lifetime_pointer_assignment : Warning<"object backing the " + "pointer %0 will be destroyed at the end of the full-expression">, + InGroup, DefaultIgnore; def warn_new_dangling_initializer_list : Warning< "array backing " "%select{initializer list subobject of the allocated object|" diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 995e4cbadacfe..d9031256f235f 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -191,7 +191,8 @@ struct IndirectLocalPathEntry { TemporaryCopy, LambdaCaptureInit, GslReferenceInit, - GslPointerInit + GslPointerInit, + GslPointerAssignment, } Kind; Expr *E; union { @@ -337,7 +338,8 @@ static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) continue; - if (PE.Kind == IndirectLocalPathEntry::GslPointerInit) + if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || + PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) return; break; } @@ -937,6 +939,7 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, case IndirectLocalPathEntry::TemporaryCopy: case IndirectLocalPathEntry::GslReferenceInit: case IndirectLocalPathEntry::GslPointerInit: + case IndirectLocalPathEntry::GslPointerAssignment: // These exist primarily to mark the path as not permitting or // supporting lifetime extension. 
break; @@ -957,16 +960,20 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, return E->getSourceRange(); } -static bool pathOnlyInitializesGslPointer(IndirectLocalPath &Path) { +static bool pathOnlyHandlesGslPointer(IndirectLocalPath &Path) { for (const auto &It : llvm::reverse(Path)) { - if (It.Kind == IndirectLocalPathEntry::VarInit) - continue; - if (It.Kind == IndirectLocalPathEntry::AddressOf) - continue; - if (It.Kind == IndirectLocalPathEntry::LifetimeBoundCall) + switch (It.Kind) { + case IndirectLocalPathEntry::VarInit: + case IndirectLocalPathEntry::AddressOf: + case IndirectLocalPathEntry::LifetimeBoundCall: continue; - return It.Kind == IndirectLocalPathEntry::GslPointerInit || - It.Kind == IndirectLocalPathEntry::GslReferenceInit; + case IndirectLocalPathEntry::GslPointerInit: + case IndirectLocalPathEntry::GslReferenceInit: + case IndirectLocalPathEntry::GslPointerAssignment: + return true; + default: + return false; + } } return false; } @@ -975,7 +982,8 @@ static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, LifetimeKind LK, - const AssignedEntity *AEntity, Expr *Init) { + const AssignedEntity *AEntity, Expr *Init, + bool EnableLifetimeWarnings) { assert((AEntity && LK == LK_Assignment) || (InitEntity && LK != LK_Assignment)); // If this entity doesn't have an interesting lifetime, don't bother looking @@ -992,9 +1000,9 @@ static void checkExprLifetimeImpl(Sema &SemaRef, auto *MTE = dyn_cast(L); - bool IsGslPtrInitWithGslTempOwner = false; + bool IsGslPtrValueFromGslTempOwner = false; bool IsLocalGslOwner = false; - if (pathOnlyInitializesGslPointer(Path)) { + if (pathOnlyHandlesGslPointer(Path)) { if (isa(L)) { // We do not want to follow the references when returning a pointer // originating from a local owner to avoid the following false positive: @@ -1005,13 +1013,13 @@ static void checkExprLifetimeImpl(Sema &SemaRef, if (pathContainsInit(Path) || !IsLocalGslOwner) return false; } else { - IsGslPtrInitWithGslTempOwner = + IsGslPtrValueFromGslTempOwner = MTE && !MTE->getExtendingDecl() && isRecordWithAttr(MTE->getType()); // Skipping a chain of initializing gsl::Pointer annotated objects. // We are looking only for the final source to find out if it was // a local or temporary owner or the address of a local variable/param. - if (!IsGslPtrInitWithGslTempOwner) + if (!IsGslPtrValueFromGslTempOwner) return true; } } @@ -1030,7 +1038,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, return false; } - if (IsGslPtrInitWithGslTempOwner && DiagLoc.isValid()) { + if (IsGslPtrValueFromGslTempOwner && DiagLoc.isValid()) { SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer) << DiagRange; return false; @@ -1073,14 +1081,16 @@ static void checkExprLifetimeImpl(Sema &SemaRef, } case LK_Assignment: { - if (!MTE) + if (!MTE || pathContainsInit(Path)) return false; assert(shouldLifetimeExtendThroughPath(Path) == PathLifetimeKind::NoExtend && "No lifetime extension for assignments"); - if (!pathContainsInit(Path)) - SemaRef.Diag(DiagLoc, diag::warn_dangling_pointer_assignment) - << AEntity->LHS << DiagRange; + SemaRef.Diag(DiagLoc, + IsGslPtrValueFromGslTempOwner + ? diag::warn_dangling_lifetime_pointer_assignment + : diag::warn_dangling_pointer_assignment) + << AEntity->LHS << DiagRange; return false; } case LK_MemInitializer: { @@ -1090,7 +1100,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, // temporary, the program is ill-formed. 
if (auto *ExtendingDecl = ExtendingEntity ? ExtendingEntity->getDecl() : nullptr) { - if (IsGslPtrInitWithGslTempOwner) { + if (IsGslPtrValueFromGslTempOwner) { SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer_member) << ExtendingDecl << DiagRange; SemaRef.Diag(ExtendingDecl->getLocation(), @@ -1131,7 +1141,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, // Suppress false positives for code like the one below: // Ctor(unique_ptr up) : member(*up), member2(move(up)) {} - if (IsLocalGslOwner && pathOnlyInitializesGslPointer(Path)) + if (IsLocalGslOwner && pathOnlyHandlesGslPointer(Path)) return false; auto *DRE = dyn_cast(L); @@ -1159,7 +1169,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, case LK_New: if (isa(L)) { - if (IsGslPtrInitWithGslTempOwner) + if (IsGslPtrValueFromGslTempOwner) SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer) << DiagRange; else @@ -1226,6 +1236,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, case IndirectLocalPathEntry::TemporaryCopy: case IndirectLocalPathEntry::GslPointerInit: case IndirectLocalPathEntry::GslReferenceInit: + case IndirectLocalPathEntry::GslPointerAssignment: // FIXME: Consider adding a note for these. break; @@ -1265,9 +1276,11 @@ static void checkExprLifetimeImpl(Sema &SemaRef, return false; }; - bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); llvm::SmallVector Path; + if (EnableLifetimeWarnings && LK == LK_Assignment && + isRecordWithAttr(AEntity->LHS->getType())) + Path.push_back({IndirectLocalPathEntry::GslPointerAssignment, Init}); + if (Init->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, TemporaryVisitor, @@ -1284,16 +1297,26 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, auto LTResult = getEntityLifetime(&Entity); LifetimeKind LK = LTResult.getInt(); const InitializedEntity *ExtendingEntity = LTResult.getPointer(); - checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, nullptr, Init); + bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); + checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, + /*AEntity*/ nullptr, Init, EnableLifetimeWarnings); } void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, Expr *Init) { - if (!Entity.LHS->getType()->isPointerType()) // builtin pointer type + bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); + bool RunAnalysis = Entity.LHS->getType()->isPointerType() || + (EnableLifetimeWarnings && + isRecordWithAttr(Entity.LHS->getType())); + + if (!RunAnalysis) return; + checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr, /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity, - Init); + Init, EnableLifetimeWarnings); } } // namespace clang::sema diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 472e7ae5d1d3f..a8d250fbabfed 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CheckExprLifetime.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/CXXInheritance.h" @@ -14714,10 +14715,12 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, FnDecl)) return ExprError(); - // Check for a self move. 
- if (Op == OO_Equal) + if (Op == OO_Equal) { + // Check for a self move. DiagnoseSelfMove(Args[0], Args[1], OpLoc); - + // lifetime check. + checkExprLifetime(*this, AssignedEntity{Args[0]}, Args[1]); + } if (ImplicitThis) { QualType ThisType = Context.getPointerType(ImplicitThis->getType()); QualType ThisTypeFromDecl = Context.getPointerType( diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp index 60b8f3ddedcd1..d1266027bdd34 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg-disabled.cpp @@ -21,3 +21,7 @@ MyIntPointer g() { MyIntOwner o; return o; // No warning, it is disabled. } + +void h(MyIntPointer p) { + p = MyIntOwner(); // No warning, it is disabled. +} diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index b3ca173c1fdbc..09dfb2b5d96a8 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -120,11 +120,13 @@ MyLongPointerFromConversion global2; void initLocalGslPtrWithTempOwner() { MyIntPointer p = MyIntOwner{}; // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}} - p = MyIntOwner{}; // TODO ? - global = MyIntOwner{}; // TODO ? + MyIntPointer pp = p = MyIntOwner{}; // expected-warning {{object backing the pointer p will be}} + p = MyIntOwner{}; // expected-warning {{object backing the pointer p }} + pp = p; // no warning + global = MyIntOwner{}; // expected-warning {{object backing the pointer global }} MyLongPointerFromConversion p2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}} - p2 = MyLongOwnerWithConversion{}; // TODO ? - global2 = MyLongOwnerWithConversion{}; // TODO ? + p2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer p2 }} + global2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer global2 }} } namespace __gnu_cxx { @@ -170,6 +172,7 @@ struct basic_string_view { basic_string_view(const T *); const T *begin() const; }; +using string_view = basic_string_view; template struct iter { iter& operator-=(int); @@ -188,7 +191,7 @@ struct basic_string { operator basic_string_view () const; using const_iterator = iter; }; - +using string = basic_string; template struct unique_ptr { @@ -346,6 +349,12 @@ void handleTernaryOperator(bool cond) { std::basic_string_view v = cond ? 
def : ""; // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}} } +std::string operator+(std::string_view s1, std::string_view s2); +void danglingStringviewAssignment(std::string_view a1, std::string_view a2) { + a1 = std::string(); // expected-warning {{object backing}} + a2 = a1 + a1; // expected-warning {{object backing}} +} + std::reference_wrapper danglingPtrFromNonOwnerLocal() { int i = 5; return i; // TODO diff --git a/clang/test/SemaCXX/warn-dangling-local.cpp b/clang/test/SemaCXX/warn-dangling-local.cpp index 2808a4c01f88d..5ad5013b6f025 100644 --- a/clang/test/SemaCXX/warn-dangling-local.cpp +++ b/clang/test/SemaCXX/warn-dangling-local.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++11 -Wdangling-assignment-gsl %s using T = int[]; @@ -34,6 +34,6 @@ struct basic_string { }; } // namespace std void test(const char* a) { - // verify we're emitting the `-Wdangling-assignment` warning. + // verify we're emitting the `-Wdangling-assignment-gsl` warning. a = std::basic_string().c_str(); // expected-warning {{object backing the pointer a will be destroyed at the end of the full-expression}} } From 26af44b3985c762b2cbaf348f8012a30af09151f Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Thu, 18 Jul 2024 13:33:03 +0530 Subject: [PATCH 388/777] [DebugInfo][SCCPSolver] Fix missing debug locations (#98876) Fixes #98875 --- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 4 +- ...ebugloc-signedinst-branch-feasible-succ.ll | 71 +++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 7bfff4dfa67ad..2336466a25a17 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -228,6 +228,7 @@ static bool replaceSignedInst(SCCPSolver &Solver, NewInst->takeName(&Inst); InsertedValues.insert(NewInst); Inst.replaceAllUsesWith(NewInst); + NewInst->setDebugLoc(Inst.getDebugLoc()); Solver.removeLatticeValueFor(&Inst); Inst.eraseFromParent(); return true; @@ -307,7 +308,8 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU, Updates.push_back({DominatorTree::Delete, BB, Succ}); } - BranchInst::Create(OnlyFeasibleSuccessor, BB); + Instruction *BI = BranchInst::Create(OnlyFeasibleSuccessor, BB); + BI->setDebugLoc(TI->getDebugLoc()); TI->eraseFromParent(); DTU.applyUpdatesPermissive(Updates); } else if (FeasibleSuccessors.size() > 1) { diff --git a/llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll b/llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll new file mode 100644 index 0000000000000..790a794bfc23e --- /dev/null +++ b/llvm/test/Transforms/SCCP/preserving-debugloc-signedinst-branch-feasible-succ.ll @@ -0,0 +1,71 @@ +; Test that the debug information is propagated correctly to the new instructions +; RUN: opt < %s -passes=ipsccp -S | FileCheck %s + +define double @sdiv_ashr_sitofp_dbg_pres(i7 %y) !dbg !5 { +; CHECK-LABEL: define double @sdiv_ashr_sitofp_dbg_pres( +; CHECK: [[SDIV:%.*]] = udiv i8 42, [[ZEXT1:%.*]], !dbg [[DBG9:![0-9]+]] +; CHECK: [[ASHR:%.*]] = lshr i8 42, [[SDIV]], !dbg [[DBG10:![0-9]+]] +; CHECK: [[SITOFP:%.*]] = uitofp nneg i16 [[ZEXT2:%.*]] to double, !dbg [[DBG12:![0-9]+]] +; + %zext1 = zext i7 %y to i8, !dbg !8 + %sdiv = sdiv i8 42, %zext1, !dbg 
!9 + %ashr = ashr i8 42, %sdiv, !dbg !10 + %zext2 = zext i8 %ashr to i16, !dbg !11 + %sitofp = sitofp i16 %zext2 to double, !dbg !12 + ret double %sitofp, !dbg !13 +} + +define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) !dbg !14 { +; CHECK-LABEL: define i32 @test_duplicate_successors_phi( +; CHECK: switch: +; CHECK-NEXT: br label %[[SWITCH_DEFAULT:.*]], !dbg [[DBG16:![0-9]+]] +; +entry: + br i1 %c, label %switch, label %end, !dbg !15 + +switch: ; preds = %entry + switch i32 -1, label %switch.default [ + i32 0, label %end + i32 1, label %end + ], !dbg !16 + +switch.default: ; preds = %switch + ret i32 -1, !dbg !17 + +end: ; preds = %switch, %switch, %entry + %phi = phi i32 [ %x, %entry ], [ 1, %switch ], [ 1, %switch ], !dbg !18 + ret i32 %phi, !dbg !19 +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +;. +; CHECK: [[DBG9]] = !DILocation(line: 2 +; CHECK: [[DBG10]] = !DILocation(line: 3 +; CHECK: [[DBG12]] = !DILocation(line: 5 +; CHECK: [[DBG16]] = !DILocation(line: 8 +;. + + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "sccp.ll", directory: "/") +!2 = !{i32 11} +!3 = !{i32 0} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "sdiv_ashr_sitofp_dbg_pres", linkageName: "sdiv_ashr_sitofp_dbg_pres", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 1, column: 1, scope: !5) +!9 = !DILocation(line: 2, column: 1, scope: !5) +!10 = !DILocation(line: 3, column: 1, scope: !5) +!11 = !DILocation(line: 4, column: 1, scope: !5) +!12 = !DILocation(line: 5, column: 1, scope: !5) +!13 = !DILocation(line: 6, column: 1, scope: !5) +!14 = distinct !DISubprogram(name: "test_duplicate_successors_phi", linkageName: "test_duplicate_successors_phi", scope: null, file: !1, line: 7, type: !6, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!15 = !DILocation(line: 7, column: 1, scope: !14) +!16 = !DILocation(line: 8, column: 1, scope: !14) +!17 = !DILocation(line: 9, column: 1, scope: !14) +!18 = !DILocation(line: 10, column: 1, scope: !14) +!19 = !DILocation(line: 11, column: 1, scope: !14) From 57539418bae45e3c972e8f4f0a88577f807e8697 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Thu, 18 Jul 2024 09:08:25 +0100 Subject: [PATCH 389/777] [SROA] Fix debug locations for variables with non-zero offsets (#97750) Fixes issue #61981 by adjusting variable location offsets (in the DIExpression) when splitting allocas. Patch [4/4] to fix structured bindings in SROA. NOTE: There's still a bug in mem2reg which generates incorrect locations in some situations: if the variable fragment has an offset into the new (split) alloca, mem2reg will fail to convert that into a bit shift (the location contains a garbage offset). That's not addressed here. insertNewDbgInst - Now takes the address-expression and FragmentInfo as separate parameters because unlike dbg_declares dbg_assigns want those to go to different places. dbg_assign records put the variable fragment info in the value expression only (whereas dbg_declare has only one expression so puts it there - ideally this information wouldn't live in DIExpression, but that's another issue). MigrateOne - Modified to correctly compute the necessary offsets and fragment adjustments. 
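For instance, a sketch mirroring the fun1 case in the new test added below (the IR names in the comments are illustrative):

```cpp
// C++17 source from the new test; the comments sketch the debug info before
// and after SROA splits the backing alloca.
struct two { int a, b; } gt;

int fun1() {
  auto [x, y] = gt;  // one 8-byte alloca; 'y' starts out described as
                     //   #dbg_declare(ptr %tmp, !y, !DIExpression(DW_OP_plus_uconst, 4))
                     // after the alloca is split into two i32 slices, the
                     // 4-byte offset is absorbed by choosing the second slice:
                     //   #dbg_value(i32 %b_reg, !y, !DIExpression())
  return x + y;
}
```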
The previous implementation produced bogus locations for variables with
non-zero offsets. The changes replace most of the body of this lambda, so it
might be easier to review in a split-diff view and focus on the change as a
whole than to compare it to the old implementation.

This uses calculateFragmentIntersect and extractLeadingOffset added in
previous patches in this series, and createOrReplaceFragment described below.

createOrReplaceFragment - Similar to DIExpression::createFragmentExpression
except for 3 important distinctions:
1. The new fragment isn't relative to an existing fragment.
2. There are no checks on the operation types because it is assumed the
   location this expression is computing is not implicit (i.e., it's always
   safe to create a fragment because arithmetic operations apply to the
   address computation, not to an implicit value computation).
3. Existing extract_bits are modified independently of fragment changes using
   \p BitExtractOffset. A change to the fragment offset or size may affect a
   bit extract. But a bit extract offset can change independently of the
   fragment dimensions.

Returns the new expression, or nullptr if one couldn't be created. Ideally
this is only used to signal that a bit-extract has become zero-sized (and thus
the new debug record has no size and can be dropped); however, it fails for
other reasons too - see the FIXME below.

FIXME: To keep the scope of this change focused on non-bitfield structured
bindings, the function bails in situations where
DIExpression::createFragmentExpression fails, e.g. when fragment and bit
extract sizes differ. These limitations can be removed in the future.
---
 llvm/lib/Transforms/Scalar/SROA.cpp           | 336 ++++++++++++++----
 .../sroa/var-sized-fragment.ll                |   5 +-
 .../DebugInfo/Generic/sroa-alloca-offset.ll   | 275 ++++++++++++++
 3 files changed, 551 insertions(+), 65 deletions(-)
 create mode 100644 llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 4d8fd5d3b9f5c..c738a2a6f39a4 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -4967,32 +4967,218 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   return NewAI;
 }
 
-static void insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig,
-                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
-                             Instruction *BeforeInst) {
-  DIB.insertDeclare(NewAddr, Orig->getVariable(), NewFragmentExpr,
+// There isn't a shared interface to get the "address" parts out of a
+// dbg.declare and dbg.assign, so provide some wrappers now for
+// both debug intrinsics and records.
+const Value *getAddress(const DbgVariableIntrinsic *DVI) { + if (const auto *DAI = dyn_cast(DVI)) + return DAI->getAddress(); + return cast(DVI)->getAddress(); +} + +const Value *getAddress(const DbgVariableRecord *DVR) { + assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || + DVR->getType() == DbgVariableRecord::LocationType::Assign); + return DVR->getAddress(); +} + +bool isKillAddress(const DbgVariableIntrinsic *DVI) { + if (const auto *DAI = dyn_cast(DVI)) + return DAI->isKillAddress(); + return cast(DVI)->isKillLocation(); +} + +bool isKillAddress(const DbgVariableRecord *DVR) { + assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || + DVR->getType() == DbgVariableRecord::LocationType::Assign); + if (DVR->getType() == DbgVariableRecord::LocationType::Assign) + return DVR->isKillAddress(); + return DVR->isKillLocation(); +} + +const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) { + if (const auto *DAI = dyn_cast(DVI)) + return DAI->getAddressExpression(); + return cast(DVI)->getExpression(); +} + +const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) { + assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || + DVR->getType() == DbgVariableRecord::LocationType::Assign); + if (DVR->getType() == DbgVariableRecord::LocationType::Assign) + return DVR->getAddressExpression(); + return DVR->getExpression(); +} + +/// Create or replace an existing fragment in a DIExpression with \p Frag. +/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext +/// operation, add \p BitExtractOffset to the offset part. +/// +/// Returns the new expression, or nullptr if this fails (see details below). +/// +/// This function is similar to DIExpression::createFragmentExpression except +/// for 3 important distinctions: +/// 1. The new fragment isn't relative to an existing fragment. +/// 2. It assumes the computed location is a memory location. This means we +/// don't need to perform checks that creating the fragment preserves the +/// expression semantics. +/// 3. Existing extract_bits are modified independently of fragment changes +/// using \p BitExtractOffset. A change to the fragment offset or size +/// may affect a bit extract. But a bit extract offset can change +/// independently of the fragment dimensions. +/// +/// Returns the new expression, or nullptr if one couldn't be created. +/// Ideally this is only used to signal that a bit-extract has become +/// zero-sized (and thus the new debug record has no size and can be +/// dropped), however, it fails for other reasons too - see the FIXME below. +/// +/// FIXME: To keep the change that introduces this function NFC it bails +/// in some situations unecessarily, e.g. when fragment and bit extract +/// sizes differ. +static DIExpression *createOrReplaceFragment(const DIExpression *Expr, + DIExpression::FragmentInfo Frag, + int64_t BitExtractOffset) { + SmallVector Ops; + bool HasFragment = false; + bool HasBitExtract = false; + + for (auto &Op : Expr->expr_ops()) { + if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) { + HasFragment = true; + continue; + } + if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext || + Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) { + HasBitExtract = true; + int64_t ExtractOffsetInBits = Op.getArg(0); + int64_t ExtractSizeInBits = Op.getArg(1); + + // DIExpression::createFragmentExpression doesn't know how to handle + // a fragment that is smaller than the extract. Copy the behaviour + // (bail) to avoid non-NFC changes. 
+ // FIXME: Don't do this. + if (Frag.SizeInBits < uint64_t(ExtractSizeInBits)) + return nullptr; + + assert(BitExtractOffset <= 0); + int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset; + + // DIExpression::createFragmentExpression doesn't know what to do + // if the new extract starts "outside" the existing one. Copy the + // behaviour (bail) to avoid non-NFC changes. + // FIXME: Don't do this. + if (AdjustedOffset < 0) + return nullptr; + + Ops.push_back(Op.getOp()); + Ops.push_back(std::max(0, AdjustedOffset)); + Ops.push_back(ExtractSizeInBits); + continue; + } + Op.appendToVector(Ops); + } + + // Unsupported by createFragmentExpression, so don't support it here yet to + // preserve NFC-ness. + if (HasFragment && HasBitExtract) + return nullptr; + + if (!HasBitExtract) { + Ops.push_back(dwarf::DW_OP_LLVM_fragment); + Ops.push_back(Frag.OffsetInBits); + Ops.push_back(Frag.SizeInBits); + } + return DIExpression::get(Expr->getContext(), Ops); +} + +/// Insert a new dbg.declare. +/// \p Orig Original to copy debug loc and variable from. +/// \p NewAddr Location's new base address. +/// \p NewAddrExpr New expression to apply to address. +/// \p BeforeInst Insert position. +/// \p NewFragment New fragment (absolute, non-relative). +/// \p BitExtractAdjustment Offset to apply to any extract_bits op. +static void +insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr, + DIExpression *NewAddrExpr, Instruction *BeforeInst, + std::optional NewFragment, + int64_t BitExtractAdjustment) { + if (NewFragment) + NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment, + BitExtractAdjustment); + if (!NewAddrExpr) + return; + + DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr, Orig->getDebugLoc(), BeforeInst); } -static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, - AllocaInst *NewAddr, DIExpression *NewFragmentExpr, - Instruction *BeforeInst) { + +/// Insert a new dbg.assign. +/// \p Orig Original to copy debug loc, variable, value and value expression +/// from. +/// \p NewAddr Location's new base address. +/// \p NewAddrExpr New expression to apply to address. +/// \p BeforeInst Insert position. +/// \p NewFragment New fragment (absolute, non-relative). +/// \p BitExtractAdjustment Offset to apply to any extract_bits op. +static void +insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr, + DIExpression *NewAddrExpr, Instruction *BeforeInst, + std::optional NewFragment, + int64_t BitExtractAdjustment) { + // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr. (void)BeforeInst; + + // A dbg.assign puts fragment info in the value expression only. The address + // expression has already been built: NewAddrExpr. + DIExpression *NewFragmentExpr = Orig->getExpression(); + if (NewFragment) + NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment, + BitExtractAdjustment); + if (!NewFragmentExpr) + return; + + // Apply a DIAssignID to the store if it doesn't already have it. 
if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { NewAddr->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(NewAddr->getContext())); } + Instruction *NewAssign = DIB.insertDbgAssign(NewAddr, Orig->getValue(), Orig->getVariable(), - NewFragmentExpr, NewAddr, - Orig->getAddressExpression(), Orig->getDebugLoc()) + NewFragmentExpr, NewAddr, NewAddrExpr, + Orig->getDebugLoc()) .get(); LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n"); (void)NewAssign; } -static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, - AllocaInst *NewAddr, DIExpression *NewFragmentExpr, - Instruction *BeforeInst) { + +/// Insert a new DbgRecord. +/// \p Orig Original to copy record type, debug loc and variable from, and +/// additionally value and value expression for dbg_assign records. +/// \p NewAddr Location's new base address. +/// \p NewAddrExpr New expression to apply to address. +/// \p BeforeInst Insert position. +/// \p NewFragment New fragment (absolute, non-relative). +/// \p BitExtractAdjustment Offset to apply to any extract_bits op. +static void +insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr, + DIExpression *NewAddrExpr, Instruction *BeforeInst, + std::optional NewFragment, + int64_t BitExtractAdjustment) { (void)DIB; + + // A dbg_assign puts fragment info in the value expression only. The address + // expression has already been built: NewAddrExpr. A dbg_declare puts the + // new fragment info into NewAddrExpr (as it only has one expression). + DIExpression *NewFragmentExpr = + Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr; + if (NewFragment) + NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment, + BitExtractAdjustment); + if (!NewFragmentExpr) + return; + if (Orig->isDbgDeclare()) { DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare( NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc()); @@ -5000,13 +5186,16 @@ static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, BeforeInst->getIterator()); return; } + + // Apply a DIAssignID to the store if it doesn't already have it. if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { NewAddr->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(NewAddr->getContext())); } + DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign( NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, - Orig->getAddressExpression(), Orig->getDebugLoc()); + NewAddrExpr, Orig->getDebugLoc()); LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n"); (void)NewAssign; } @@ -5019,7 +5208,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { unsigned NumPartitions = 0; bool Changed = false; - const DataLayout &DL = AI.getDataLayout(); + const DataLayout &DL = AI.getModule()->getDataLayout(); // First try to pre-split loads and stores. Changed |= presplitLoadsAndStores(AI, AS); @@ -5113,54 +5302,78 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. auto MigrateOne = [&](auto *DbgVariable) { - auto *Expr = DbgVariable->getExpression(); - DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); - uint64_t AllocaSize = - DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedValue(); - for (auto Fragment : Fragments) { - // Create a fragment expression describing the new partition or reuse AI's - // expression if there is only one partition. 
- auto *FragmentExpr = Expr; - if (Fragment.Size < AllocaSize || Expr->isFragment()) { - // If this alloca is already a scalar replacement of a larger aggregate, - // Fragment.Offset describes the offset inside the scalar. - auto ExprFragment = Expr->getFragmentInfo(); - uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0; - uint64_t Start = Offset + Fragment.Offset; - uint64_t Size = Fragment.Size; - if (ExprFragment) { - uint64_t AbsEnd = - ExprFragment->OffsetInBits + ExprFragment->SizeInBits; - if (Start >= AbsEnd) { - // No need to describe a SROAed padding. - continue; - } - Size = std::min(Size, AbsEnd - Start); - } - // The new, smaller fragment is stenciled out from the old fragment. - if (auto OrigFragment = FragmentExpr->getFragmentInfo()) { - assert(Start >= OrigFragment->OffsetInBits && - "new fragment is outside of original fragment"); - Start -= OrigFragment->OffsetInBits; - } + // Can't overlap with undef memory. + if (isKillAddress(DbgVariable)) + return; - // The alloca may be larger than the variable. - auto VarSize = DbgVariable->getVariable()->getSizeInBits(); - if (VarSize) { - if (Size > *VarSize) - Size = *VarSize; - if (Size == 0 || Start + Size > *VarSize) - continue; - } + const Value *DbgPtr = getAddress(DbgVariable); + DIExpression::FragmentInfo VarFrag = + DbgVariable->getFragmentOrEntireVariable(); + // Get the address expression constant offset if one exists and the ops + // that come after it. + int64_t CurrentExprOffsetInBytes = 0; + SmallVector PostOffsetOps; + if (!getAddressExpression(DbgVariable) + ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps)) + return; // Couldn't interpret this DIExpression - drop the var. + + // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext. + int64_t ExtractOffsetInBits = 0; + for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) { + if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext || + Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) { + ExtractOffsetInBits = Op.getArg(0); + break; + } + } - // Avoid creating a fragment expression that covers the entire variable. - if (!VarSize || *VarSize != Size) { - if (auto E = - DIExpression::createFragmentExpression(Expr, Start, Size)) - FragmentExpr = *E; - else - continue; - } + DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); + for (auto Fragment : Fragments) { + int64_t OffsetFromLocationInBits; + std::optional NewDbgFragment; + // Find the variable fragment that the new alloca slice covers. + // Drop debug info for this variable fragment if we can't compute an + // intersect between it and the alloca slice. + if (!DIExpression::calculateFragmentIntersect( + DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr, + CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag, + NewDbgFragment, OffsetFromLocationInBits)) + continue; // Do not migrate this fragment to this slice. + + // Zero sized fragment indicates there's no intersect between the variable + // fragment and the alloca slice. Skip this slice for this variable + // fragment. + if (NewDbgFragment && !NewDbgFragment->SizeInBits) + continue; // Do not migrate this fragment to this slice. + + // No fragment indicates DbgVariable's variable or fragment exactly + // overlaps the slice; copy its fragment (or nullopt if there isn't one). + if (!NewDbgFragment) + NewDbgFragment = DbgVariable->getFragment(); + + // Reduce the new expression offset by the bit-extract offset since + // we'll be keeping that. 
+ int64_t OffestFromNewAllocaInBits = + OffsetFromLocationInBits - ExtractOffsetInBits; + // We need to adjust an existing bit extract if the offset expression + // can't eat the slack (i.e., if the new offset would be negative). + int64_t BitExtractOffset = + std::min(0, OffestFromNewAllocaInBits); + // The magnitude of a negative value indicates the number of bits into + // the existing variable fragment that the memory region begins. The new + // variable fragment already excludes those bits - the new DbgPtr offset + // only needs to be applied if it's positive. + OffestFromNewAllocaInBits = + std::max(int64_t(0), OffestFromNewAllocaInBits); + + // Rebuild the expression: + // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment} + // Add NewDbgFragment later, because dbg.assigns don't want it in the + // address expression but the value expression instead. + DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps); + if (OffestFromNewAllocaInBits > 0) { + int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8; + NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes); } // Remove any existing intrinsics on the new alloca describing @@ -5177,7 +5390,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { for_each(findDbgDeclares(Fragment.Alloca), RemoveOne); for_each(findDVRDeclares(Fragment.Alloca), RemoveOne); - insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI); + insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI, + NewDbgFragment, BitExtractOffset); } }; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll index 55119114bd602..c2bcd4dcfeeee 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll @@ -19,10 +19,7 @@ ;; return a; ;; } -;; FIXME: Variable 'b' gets an incorrect location (value and expression) - see -;; llvm.org/PR61981. This check just ensures that no fragment info is added to -;; the dbg.value. -; CHECK: #dbg_value(i32 %.sroa.0.0.extract.trunc, ![[B:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 4), +; CHECK: #dbg_value(i32 %.sroa.2.0.extract.trunc, ![[B:[0-9]+]], !DIExpression(), ; CHECK: #dbg_value(i32 %.sroa.0.0.extract.trunc, ![[A:[0-9]+]], !DIExpression(), ; CHECK: ![[A]] = !DILocalVariable(name: "a", ; CHECK: ![[B]] = !DILocalVariable(name: "b", diff --git a/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll b/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll new file mode 100644 index 0000000000000..3789084b6b712 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll @@ -0,0 +1,275 @@ +; RUN: opt %s -passes=sroa -S | FileCheck %s --check-prefixes=COMMON,OLD +; RUN: opt %s -passes=declare-to-assign,sroa -S | FileCheck %s --check-prefixes=COMMON,NEW + +;; C++17 source: +;; struct two { int a, b; } gt; +;; int fun1() { +;; auto [x, y] = gt; +;; return x + y; +;; } +;; +;; struct four { two a, b; } gf; +;; int fun2() { +;; auto [x, y] = gf; +;; return x.a + y.b; +;; } +;; Plus some hand-written IR. +;; +;; Check that SROA understands how to split dbg.declares and dbg.assigns with +;; offsets into their storge (e.g., the second variable in a structured binding +;; is stored at an offset into the shared alloca). 
+;; +;; Additional notes: +;; We expect the same dbg.value intrinsics to come out of SROA whether assignment +;; tracking is enabled or not. However, the order of the debug intrinsics may +;; differ, and assignment tracking replaces some dbg.declares with dbg.assigns. +;; +;; Structured bindings produce an artificial variable that covers the entire +;; alloca. Although they add clutter to the test, they've been preserved in +;; order to increase coverage. These use the placehold name 'A' in comments and +;; checks. + +%struct.two = type { i32, i32 } +%struct.four = type { %struct.two, %struct.two } + +@gt = dso_local global %struct.two zeroinitializer, align 4, !dbg !0 +@gf = dso_local global %struct.four zeroinitializer, align 4, !dbg !5 + + +; COMMON-LABEL: @_Z4fun1v +; COMMON-NEXT: entry +;; 32 bit variable x (!27): value a_reg. +;; +;; 32 bit variable y (!28): value b_reg. +;; +;; 64 bit variable A (!29) bits [0, 32): value a_reg. +;; 64 bit variable A (!29) bits [32, 64): value b_reg. + +; OLD-NEXT: %[[a_reg:.*]] = load i32, ptr @gt +; OLD-NEXT: #dbg_value(i32 %[[a_reg]], ![[x0:[0-9]+]], !DIExpression(), +; OLD-NEXT: #dbg_value(i32 %[[a_reg]], ![[A0:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +; OLD-NEXT: %[[b_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gt, i64 4) +; OLD-NEXT: #dbg_value(i32 %[[b_reg]], ![[y0:[0-9]+]], !DIExpression(), +; OLD-NEXT: #dbg_value(i32 %[[b_reg]], ![[A0]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), + +; NEW-NEXT: %[[a_reg:.*]] = load i32, ptr @gt +; NEW-NEXT: %[[b_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gt, i64 4) +; NEW-NEXT: #dbg_value(i32 %[[b_reg]], ![[y0:[0-9]+]], !DIExpression(), +; NEW-NEXT: #dbg_value(i32 %[[a_reg]], ![[A0:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +; NEW-NEXT: #dbg_value(i32 %[[b_reg]], ![[A0]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), +; NEW-NEXT: #dbg_value(i32 %[[a_reg]], ![[x0:[0-9]+]], !DIExpression(), +define dso_local noundef i32 @_Z4fun1v() #0 !dbg !23 { +entry: + %0 = alloca %struct.two, align 4 + #dbg_declare(ptr %0, !27, !DIExpression(), !31) + #dbg_declare(ptr %0, !28, !DIExpression(DW_OP_plus_uconst, 4), !31) + #dbg_declare(ptr %0, !29, !DIExpression(), !31) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 @gt, i64 8, i1 false), !dbg !31 + %a = getelementptr inbounds %struct.two, ptr %0, i32 0, i32 0, !dbg !31 + %1 = load i32, ptr %a, align 4, !dbg !31 + %b = getelementptr inbounds %struct.two, ptr %0, i32 0, i32 1, !dbg !31 + %2 = load i32, ptr %b, align 4, !dbg !31 + %add = add nsw i32 %1, %2, !dbg !31 + ret i32 %add, !dbg !31 +} + +; COMMON-LABEL: _Z4fun2v() +; COMMON-NEXT: entry: +;; 64 bit variable x (!50) bits [0, 32): value aa_reg. +;; 64 bit variable x (!50) bits [32, 64): deref ab_ba_addr +;; +;; 64 bit variable y (!51) bits [0, 32): deref ab_ba_addr + 32 +;; 64 bit variable y (!51) bits [32, 64): value bb_reg. +;; +;; 128 bit variable A (!52) bits [0, 32): value aa_reg +;; 128 bit variable A (!52) bits [32, 64): deref ab_ba_addr +;; 128 bit variable A (!52) bits [96, 128): value bb_reg +;; +;; NOTE: This 8 byte alloca contains x.b (4 bytes) and y.a (4 bytes). 
+; COMMON-NEXT: %[[ab_ba_addr:.*]] = alloca [8 x i8], align 4 +; OLD-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[A1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 64), +; OLD-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[y1:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_LLVM_fragment, 0, 32), +; OLD-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[x1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), +; OLD-NEXT: %[[aa_reg:.*]] = load i32, ptr @gf, align 4 +; OLD-NEXT: #dbg_value(i32 %[[aa_reg]], ![[x1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +; OLD-NEXT: #dbg_value(i32 %[[aa_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +; OLD-NEXT: call void @llvm.memcpy{{.*}}(ptr align 4 %[[ab_ba_addr]], ptr align 4 getelementptr inbounds (i8, ptr @gf, i64 4), i64 8, i1 false) +; OLD-NEXT: %[[bb_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gf, i64 12), align 4 +; OLD-NEXT: #dbg_value(i32 %[[bb_reg]], ![[y1]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), +; OLD-NEXT: #dbg_value(i32 %[[bb_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), + +; NEW-NEXT: #dbg_assign(i1 undef, ![[x1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[#]], ptr %[[ab_ba_addr]], !DIExpression(), +; NEW-NEXT: #dbg_assign(i1 undef, ![[A1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 64), ![[#]], ptr %[[ab_ba_addr]], !DIExpression(), +; NEW-NEXT: #dbg_declare(ptr %[[ab_ba_addr]], ![[y1:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_LLVM_fragment, 0, 32), +; NEW-NEXT: %[[aa_reg:.*]] = load i32, ptr @gf, align 4 +; NEW-NEXT: llvm.memcpy{{.*}}(ptr align 4 %[[ab_ba_addr]], ptr align 4 getelementptr inbounds (i8, ptr @gf, i64 4), i64 8, i1 false){{.*}}, !DIAssignID ![[ID:[0-9]+]] +; NEW-NEXT: %[[bb_reg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gf, i64 12), align 4 +; NEW-NEXT: #dbg_value(i32 %[[bb_reg]], ![[y1:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), +; NEW-NEXT: #dbg_value(i32 %[[aa_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +; NEW-NEXT: #dbg_assign(i1 undef, ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 32, 64), ![[ID]], ptr %[[ab_ba_addr]], !DIExpression(), +; NEW-NEXT: #dbg_value(i32 %[[bb_reg]], ![[A1]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), +; NEW-NEXT: #dbg_value(i32 %[[aa_reg]], ![[x1]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +define dso_local noundef i32 @_Z4fun2v() #0 !dbg !48 { +entry: + %0 = alloca %struct.four, align 4 + #dbg_declare(ptr %0, !50, !DIExpression(), !54) + #dbg_declare(ptr %0, !51, !DIExpression(DW_OP_plus_uconst, 8), !54) + #dbg_declare(ptr %0, !52, !DIExpression(), !54) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 @gf, i64 16, i1 false), !dbg !54 + %a = getelementptr inbounds %struct.four, ptr %0, i32 0, i32 0, !dbg !54 + %a1 = getelementptr inbounds %struct.two, ptr %a, i32 0, i32 0, !dbg !54 + %1 = load i32, ptr %a1, align 4, !dbg !54 + %b = getelementptr inbounds %struct.four, ptr %0, i32 0, i32 1, !dbg !54 + %b2 = getelementptr inbounds %struct.two, ptr %b, i32 0, i32 1, !dbg !54 + %2 = load i32, ptr %b2, align 4, !dbg !54 + %add = add nsw i32 %1, %2, !dbg !54 + ret i32 %add, !dbg !54 +} + +;; Hand-written part to test what happens when variables are smaller than the +;; new alloca slices (i.e., check offset rewriting works correctly). Note that +;; mem2reg incorrectly preserves the offest in the DIExpression of a variable +;; stuffed into the upper bits of a value (that is a bug), e.g. alloca+offset +;; becomes vreg+offest. 
It should either convert the offest to a shift, encode +;; the register-bit offest using DW_OP_bit_piece, or use the new +;; DW_OP_LLVM_extract_bits_[sz]ext operation. +; COMMON-LABEL: _Z4fun3v() +; COMMON-NEXT: entry: +;; 16 bit variable e (!61): value ve (upper bits) +;; +;; 16 bit variable f (!62): value vgf (lower bits) +;; 16 bit variable g (!63): value vgf (upper bits) +;; +;; 16 bit variable h (!64): deref dead_64_128 +; COMMON-NEXT: %[[dead_64_128:.*]] = alloca %struct.two +; COMMON-NEXT: #dbg_declare(ptr %[[dead_64_128]], ![[h:[0-9]+]], !DIExpression(), +; COMMON-NEXT: %[[ve:.*]] = load i32, ptr @gf +;; FIXME: mem2reg bug - offset is incorrect - see comment above. +; COMMON-NEXT: #dbg_value(i32 %[[ve]], ![[e:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +; COMMON-NEXT: %[[vfg:.*]] = load i32, ptr getelementptr inbounds (i8, ptr @gf, i64 4) +; COMMON-NEXT: #dbg_value(i32 %[[vfg]], ![[f:[0-9]+]], !DIExpression(), +;; FIXME: mem2reg bug - offset is incorrect - see comment above. +; COMMON-NEXT: #dbg_value(i32 %[[vfg]], ![[g:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +define dso_local noundef i32 @_Z4fun3v() #0 !dbg !55 { +entry: + %0 = alloca %struct.four, align 4 + #dbg_declare(ptr %0, !61, !DIExpression(DW_OP_plus_uconst, 2), !58) + #dbg_declare(ptr %0, !62, !DIExpression(DW_OP_plus_uconst, 4), !58) + #dbg_declare(ptr %0, !63, !DIExpression(DW_OP_plus_uconst, 6), !58) + #dbg_declare(ptr %0, !64, !DIExpression(DW_OP_plus_uconst, 8), !58) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 @gf, i64 16, i1 false), !dbg !58 + %1 = getelementptr inbounds %struct.four, ptr %0, i32 0, i32 0, !dbg !58 + %2 = getelementptr inbounds %struct.two, ptr %1, i32 0, i32 1, !dbg !58 + %3 = load i32, ptr %2, align 4, !dbg !58 + ret i32 %3, !dbg !58 +} + +;; Check that DW_OP_extract_bits_[sz]ext compose with expression offsets and +;; that new fragments are not created. DW_OP_extract_bits_[sz]ext and fragments +;; don't compose currently (but could). There are checks that expressions with +;; bit extracts and fragments are dropped in SROA the test +;; in llvm/test/DebugInfo/Generic/sroa-extract-bits.ll. FIXME: Don't do that. +;; +;; Checks are inline for this one. +;; +;; %p alloca is 128 bits +;; SROA is going to split it in half, discard the lower bits, then split +;; the upper bits in half and discard the upper bits leaving us with +;; bits [64, 96) of the original alloca. +;; +; COMMON-LABEL: fun4 +define dso_local noundef i32 @fun4(i64 %0) !dbg !65 { +entry: + %p = alloca [2 x i64] + %1 = getelementptr inbounds [2 x i64], ptr %p, i32 0, i32 1 + store i64 %0, ptr %1 + ; COMMON: %p.sroa.0.8.extract.trunc = trunc i64 %0 to i32 + ;; Simple case - the expression offset (8 bytes) matches the offset of the + ;; slice into the alloca, so can be discarded away entirely. + ; COMMON-NEXT: #dbg_value(i32 %p.sroa.0.8.extract.trunc, ![[p:[0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 32) + #dbg_declare(ptr %p, !67, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_LLVM_extract_bits_zext, 0, 32), !66) + ;; The expression offset is 6 bytes, with a bit-extract offset of 32 bits from + ;; there for a total offset of 80 bits. SROA is going to split the alloca in + ;; half (at bit 64). The new expression needs a final bit extract offset of + ;; 80-64=16 bits applied to the mem2reg'd value. 
+ ; COMMON-NEXT: #dbg_value(i32 %p.sroa.0.8.extract.trunc, ![[q:[0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8) + #dbg_declare(ptr %p, !68, !DIExpression(DW_OP_plus_uconst, 6, DW_OP_LLVM_extract_bits_zext, 32, 8), !66) + ;; FIXME: Just as in _Z4fun3v, the offset from the new alloca (2 bytes) is + ;; correct but mem2reg needs to change it from an offset to a shift or + ;; adjust the bit-extract (e.g., add the 2 byte offset to the existing 8 bit + ;; offset for a 24 bit total bit-extract offset). + ; COMMON-NEXT: #dbg_value(i32 %p.sroa.0.8.extract.trunc, ![[r:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2, DW_OP_LLVM_extract_bits_zext, 8, 8) + #dbg_declare(ptr %p, !69, !DIExpression(DW_OP_plus_uconst, 10, DW_OP_LLVM_extract_bits_zext, 8, 8), !66) + %2 = load i32, ptr %1, align 4 + ret i32 %2 +} + +; COMMON-DAG: ![[x0]] = !DILocalVariable(name: "x", +; COMMON-DAG: ![[y0]] = !DILocalVariable(name: "y", +; COMMON-DAG: ![[A0]] = !DILocalVariable(scope: + +; COMMON-DAG: ![[x1]] = !DILocalVariable(name: "x", +; COMMON-DAG: ![[y1]] = !DILocalVariable(name: "y", +; COMMON-DAG: ![[A1]] = !DILocalVariable(scope: + +; COMMON-DAG: ![[e]] = !DILocalVariable(name: "e", +; COMMON-DAG: ![[f]] = !DILocalVariable(name: "f", +; COMMON-DAG: ![[g]] = !DILocalVariable(name: "g", +; COMMON-DAG: ![[h]] = !DILocalVariable(name: "h", + +; COMMON-DAG: ![[p]] = !DILocalVariable(name: "p" +; COMMON-DAG: ![[q]] = !DILocalVariable(name: "q" +; COMMON-DAG: ![[r]] = !DILocalVariable(name: "r" + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!16, !17} +!llvm.ident = !{!22} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "gt", scope: !2, file: !3, line: 1, type: !10, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 17.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "test.cpp", directory: "/") +!4 = !{!0, !5} +!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression()) +!6 = distinct !DIGlobalVariable(name: "gf", scope: !2, file: !3, line: 7, type: !7, isLocal: false, isDefinition: true) +!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "four", file: !3, line: 7, size: 128, flags: DIFlagTypePassByValue, elements: !8, identifier: "_ZTS4four") +!8 = !{!9, !15} +!9 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !7, file: !3, line: 7, baseType: !10, size: 64) +!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "two", file: !3, line: 1, size: 64, flags: DIFlagTypePassByValue, elements: !11, identifier: "_ZTS3two") +!11 = !{!12, !14} +!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !3, line: 1, baseType: !13, size: 32) +!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !3, line: 1, baseType: !13, size: 32, offset: 32) +!15 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !7, file: !3, line: 7, baseType: !10, size: 64, offset: 64) +!16 = !{i32 7, !"Dwarf Version", i32 5} +!17 = !{i32 2, !"Debug Info Version", i32 3} +!22 = !{!"clang version 17.0.0"} +!23 = distinct !DISubprogram(name: "fun1", linkageName: "_Z4fun1v", scope: !3, file: !3, line: 2, type: !24, scopeLine: 2, flags: DIFlagPrototyped | 
DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !26) +!24 = !DISubroutineType(types: !25) +!25 = !{!13} +!26 = !{!27, !28, !29} +!27 = !DILocalVariable(name: "x", scope: !23, file: !3, line: 3, type: !13) +!28 = !DILocalVariable(name: "y", scope: !23, file: !3, line: 3, type: !13) +!29 = !DILocalVariable(scope: !23, file: !3, line: 3, type: !10) +!31 = !DILocation(line: 3, column: 9, scope: !23) +!48 = distinct !DISubprogram(name: "fun2", linkageName: "_Z4fun2v", scope: !3, file: !3, line: 8, type: !24, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !49) +!49 = !{!50, !51, !52} +!50 = !DILocalVariable(name: "x", scope: !48, file: !3, line: 9, type: !10) +!51 = !DILocalVariable(name: "y", scope: !48, file: !3, line: 9, type: !10) +!52 = !DILocalVariable(scope: !48, file: !3, line: 9, type: !7) +!54 = !DILocation(line: 9, column: 9, scope: !48) +!55 = distinct !DISubprogram(name: "fun3", linkageName: "_Z4fun3v", scope: !3, file: !3, line: 8, type: !24, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !56) +!56 = !{} +!58 = !DILocation(line: 9, column: 9, scope: !55) +!60 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) +!61 = !DILocalVariable(name: "e", scope: !55, file: !3, line: 9, type: !60) +!62 = !DILocalVariable(name: "f", scope: !55, file: !3, line: 9, type: !60) +!63 = !DILocalVariable(name: "g", scope: !55, file: !3, line: 9, type: !60) +!64 = !DILocalVariable(name: "h", scope: !55, file: !3, line: 9, type: !60) +!65 = distinct !DISubprogram(name: "fun4", linkageName: "_Z4fun4v", scope: !3, file: !3, line: 8, type: !24, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !56) +!66 = !DILocation(line: 9, column: 9, scope: !65) +!67 = !DILocalVariable(name: "p", scope: !65, file: !3, line: 9, type: !13) +!68 = !DILocalVariable(name: "q", scope: !65, file: !3, line: 9, type: !13) +!69 = !DILocalVariable(name: "r", scope: !65, file: !3, line: 9, type: !13) From cbd255942b52c3576aa0dca444811512fff43714 Mon Sep 17 00:00:00 2001 From: dlav-sc Date: Thu, 18 Jul 2024 11:09:52 +0300 Subject: [PATCH 390/777] [lldb] add RISCV target specific info in API tests (#99039) Add information about RISCV first register in python API testsuite, that is used to check register readability in tests. 
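For context, a rough sketch of how an API test exercises this helper; the test class name, source file, and breakpoint marker below are illustrative only, not taken from this patch:

```python
import lldb
from lldbsuite.test import lldbplatformutil, lldbutil
from lldbsuite.test.lldbtest import TestBase

class ExampleRegisterReadTestCase(TestBase):
    # Illustrative test case (not part of this patch).
    def test_first_register_readable(self):
        self.build()
        # Stop at a known point so registers can be read from a live thread.
        lldbutil.run_to_source_breakpoint(
            self, "// break here", lldb.SBFileSpec("main.c"))
        # On rv32/rv64 targets this now issues "register read zero".
        lldbplatformutil.check_first_register_readable(self)
```

Before this change the helper fell through to the generic `test_case.fail(...)` path on RISC-V targets, which is why the tests listed below were failing.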
Fixed tests on RISCV target: TestBreakpointByFileColonLine.BreakpointByLineAndColumnTestCase TestAddressBreakpoints.AddressBreakpointTestCase TestBreakpointAutoContinue.BreakpointAutoContinue TestInterruptBacktrace.TestInterruptingBacktrace TestBadAddressBreakpoints.BadAddressBreakpointTestCase TestScriptedResolver.TestScriptedResolver TestStopHookScripted.TestStopHooks TestBreakpointConditions.BreakpointConditionsTestCase TestLocalVariables.LocalVariablesTestCase TestFindLineEntry.FindLineEntry TestScriptedResolver.TestScriptedResolver TestInlineSourceFiles.InlineSourceFilesTestCase TestModuleAndSection.ModuleAndSectionAPIsTestCase TestFrameVar.TestFrameVar TestInferiorAssert.AssertingInferiorTestCase TestInferiorCrashing.CrashingInferiorTestCase TestInferiorCrashingStep.CrashingInferiorStepTestCase TestRegistersIterator.RegistersIteratorTestCase TestCoroutineHandle.TestCoroutineHandle TestWithLimitDebugInfo.TestWithLimitDebugInfo TestLLDBIterator.LLDBIteratorTestCase TestMemoryWrite.MemoryWriteTestCase TestNestedTemplate.NestedTemplateTestCase TestParrayVrsCharArrayChild.TestParrayVrsCharArrayChild TestRecursiveInferior.CrashingRecursiveInferiorTestCase TestRecursiveInferiorStep.CrashingRecursiveInferiorStepTestCase TestRunLocker.TestRunLocker TestSampleTest.RenameThisSampleTestTestCase TestUniqueTypes3.UniqueTypesTestCase3 TestPrintStackTraces.ThreadsStackTracesTestCase TestUnicodeSymbols.TestUnicodeSymbols TestUnusedInlinedParameters.TestUnusedInlinedParameters TestValueVarUpdate.ValueVarUpdateTestCase TestPtrRef2Typedef.PtrRef2TypedefTestCase TestDataFormatterStdIterator.StdIteratorDataFormatterTestCase TestDataFormatterStdString.StdStringDataFormatterTestCase TestDataFormatterStdVBool.StdVBoolDataFormatterTestCase --- lldb/packages/Python/lldbsuite/test/lldbplatformutil.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py index 818fdf0e6b5c5..b7e6f240f59f6 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py @@ -34,6 +34,8 @@ def check_first_register_readable(test_case): test_case.expect("register read r0", substrs=["r0 = 0x"]) elif arch in ["powerpc64le"]: test_case.expect("register read r0", substrs=["r0 = 0x"]) + elif re.match("^rv(32|64)", arch): + test_case.expect("register read zero", substrs=["zero = 0x"]) else: # TODO: Add check for other architectures test_case.fail( From 26cb88e3210af24942310a192431c8d7c3544e21 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 18 Jul 2024 10:22:05 +0200 Subject: [PATCH 391/777] Revert "[llvm/DWARF] Recursively resolve DW_AT_signature references" (#99444) Reverts llvm/llvm-project#97423 due to a failure in the cross-project-tests. 
--- llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h | 2 + llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 36 ++++++++----- llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp | 52 ++++++++++--------- .../X86/prettyprint_type_units.s | 19 +------ 4 files changed, 55 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h index 497d3bee048ab..421b84d644db6 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -181,6 +181,8 @@ class DWARFDie { DWARFDie getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const; DWARFDie getAttributeValueAsReferencedDie(const DWARFFormValue &V) const; + DWARFDie resolveTypeUnitReference() const; + /// Extract the range base attribute from this DIE as absolute section offset. /// /// This is a utility function that checks for either the DW_AT_rnglists_base diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 345a91a6f3585..72e7464b68971 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -103,6 +103,10 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, .print(OS, DumpOpts, U); } +static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { + return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -194,8 +198,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { - if (DWARFDie D = Die.getAttributeValueAsReferencedDie(FormValue); - D && !D.isNULL()) { + DWARFDie D = resolveReferencedType(Die, FormValue); + if (D && !D.isNULL()) { OS << Space << "\""; dumpTypeQualifiedName(D, OS); OS << '"'; @@ -287,12 +291,13 @@ DWARFDie::findRecursively(ArrayRef Attrs) const { if (auto Value = Die.find(Attrs)) return Value; - for (dwarf::Attribute Attr : - {DW_AT_abstract_origin, DW_AT_specification, DW_AT_signature}) { - if (auto D = Die.getAttributeValueAsReferencedDie(Attr)) - if (Seen.insert(D).second) - Worklist.push_back(D); - } + if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) + if (Seen.insert(D).second) + Worklist.push_back(D); + + if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification)) + if (Seen.insert(D).second) + Worklist.push_back(D); } return std::nullopt; @@ -314,14 +319,21 @@ DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const { } else if (Offset = V.getAsDebugInfoReference(); Offset) { if (DWARFUnit *SpecUnit = U->getUnitVector().getUnitForOffset(*Offset)) Result = SpecUnit->getDIEForOffset(*Offset); - } else if (std::optional Sig = V.getAsSignatureReference()) { - if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash( - U->getVersion(), *Sig, U->isDWOUnit())) - Result = TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset()); } return Result; } +DWARFDie DWARFDie::resolveTypeUnitReference() const { + if (auto Attr = find(DW_AT_signature)) { + if (std::optional Sig = Attr->getAsReferenceUVal()) { + if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash( + U->getVersion(), *Sig, U->isDWOUnit())) + return TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset()); + } + } + return *this; +} + std::optional 
DWARFDie::getRangesBaseAttribute() const { return toSectionOffset(find({DW_AT_rnglists_base, DW_AT_GNU_ranges_base})); } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp index fc1aae77a9293..a26431e8313f6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp @@ -62,10 +62,17 @@ void DWARFTypePrinter::appendArrayType(const DWARFDie &D) { EndedWithTemplate = false; } +static DWARFDie resolveReferencedType(DWARFDie D, + dwarf::Attribute Attr = DW_AT_type) { + return D.getAttributeValueAsReferencedDie(Attr).resolveTypeUnitReference(); +} +static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { + return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); +} DWARFDie DWARFTypePrinter::skipQualifiers(DWARFDie D) { while (D && (D.getTag() == DW_TAG_const_type || D.getTag() == DW_TAG_volatile_type)) - D = D.getAttributeValueAsReferencedDie(DW_AT_type); + D = resolveReferencedType(D); return D; } @@ -96,9 +103,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, return DWARFDie(); } DWARFDie InnerDIE; - auto Inner = [&] { - return InnerDIE = D.getAttributeValueAsReferencedDie(DW_AT_type); - }; + auto Inner = [&] { return InnerDIE = resolveReferencedType(D); }; const dwarf::Tag T = D.getTag(); switch (T) { case DW_TAG_pointer_type: { @@ -129,8 +134,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, OS << '('; else if (Word) OS << ' '; - if (DWARFDie Cont = - D.getAttributeValueAsReferencedDie(DW_AT_containing_type)) { + if (DWARFDie Cont = resolveReferencedType(D, DW_AT_containing_type)) { appendQualifiedName(Cont); EndedWithTemplate = false; OS << "::"; @@ -169,8 +173,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, case DW_TAG_base_type: */ default: { - const char *NamePtr = - dwarf::toString(D.findRecursively(DW_AT_name), nullptr); + const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr); if (!NamePtr) { appendTypeTagName(D.getTag()); return DWARFDie(); @@ -232,9 +235,9 @@ void DWARFTypePrinter::appendUnqualifiedNameAfter( case DW_TAG_pointer_type: { if (needsParens(Inner)) OS << ')'; - appendUnqualifiedNameAfter( - Inner, Inner.getAttributeValueAsReferencedDie(DW_AT_type), - /*SkipFirstParamIfArtificial=*/D.getTag() == DW_TAG_ptr_to_member_type); + appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner), + /*SkipFirstParamIfArtificial=*/D.getTag() == + DW_TAG_ptr_to_member_type); break; } case DW_TAG_LLVM_ptrauth_type: { @@ -338,7 +341,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, appendTemplateParameters(C, FirstParameter); } if (C.getTag() == dwarf::DW_TAG_template_value_parameter) { - DWARFDie T = C.getAttributeValueAsReferencedDie(DW_AT_type); + DWARFDie T = resolveReferencedType(C); Sep(); if (T.getTag() == DW_TAG_enumeration_type) { OS << '('; @@ -458,7 +461,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, continue; auto TypeAttr = C.find(DW_AT_type); Sep(); - appendQualifiedName(TypeAttr ? C.getAttributeValueAsReferencedDie(*TypeAttr) + appendQualifiedName(TypeAttr ? resolveReferencedType(C, *TypeAttr) : DWARFDie()); } if (IsTemplate && *FirstParameter && FirstParameter == &FirstParameterValue) { @@ -470,15 +473,15 @@ bool DWARFTypePrinter::appendTemplateParameters(DWARFDie D, void DWARFTypePrinter::decomposeConstVolatile(DWARFDie &N, DWARFDie &T, DWARFDie &C, DWARFDie &V) { (N.getTag() == DW_TAG_const_type ? 
C : V) = N; - T = N.getAttributeValueAsReferencedDie(DW_AT_type); + T = resolveReferencedType(N); if (T) { auto Tag = T.getTag(); if (Tag == DW_TAG_const_type) { C = T; - T = T.getAttributeValueAsReferencedDie(DW_AT_type); + T = resolveReferencedType(T); } else if (Tag == DW_TAG_volatile_type) { V = T; - T = T.getAttributeValueAsReferencedDie(DW_AT_type); + T = resolveReferencedType(T); } } } @@ -488,11 +491,10 @@ void DWARFTypePrinter::appendConstVolatileQualifierAfter(DWARFDie N) { DWARFDie T; decomposeConstVolatile(N, T, C, V); if (T && T.getTag() == DW_TAG_subroutine_type) - appendSubroutineNameAfter(T, T.getAttributeValueAsReferencedDie(DW_AT_type), - false, C.isValid(), V.isValid()); + appendSubroutineNameAfter(T, resolveReferencedType(T), false, C.isValid(), + V.isValid()); else - appendUnqualifiedNameAfter(T, - T.getAttributeValueAsReferencedDie(DW_AT_type)); + appendUnqualifiedNameAfter(T, resolveReferencedType(T)); } void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) { DWARFDie C; @@ -502,7 +504,7 @@ void DWARFTypePrinter::appendConstVolatileQualifierBefore(DWARFDie N) { bool Subroutine = T && T.getTag() == DW_TAG_subroutine_type; DWARFDie A = T; while (A && A.getTag() == DW_TAG_array_type) - A = A.getAttributeValueAsReferencedDie(DW_AT_type); + A = resolveReferencedType(A); bool Leading = (!A || (A.getTag() != DW_TAG_pointer_type && A.getTag() != llvm::dwarf::DW_TAG_ptr_to_member_type)) && @@ -544,7 +546,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter( if (P.getTag() != DW_TAG_formal_parameter && P.getTag() != DW_TAG_unspecified_parameters) return; - DWARFDie T = P.getAttributeValueAsReferencedDie(DW_AT_type); + DWARFDie T = resolveReferencedType(P); if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) { FirstParamIfArtificial = T; RealFirst = false; @@ -565,7 +567,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter( if (DWARFDie P = FirstParamIfArtificial) { if (P.getTag() == DW_TAG_pointer_type) { auto CVStep = [&](DWARFDie CV) { - if (DWARFDie U = CV.getAttributeValueAsReferencedDie(DW_AT_type)) { + if (DWARFDie U = resolveReferencedType(CV)) { Const |= U.getTag() == DW_TAG_const_type; Volatile |= U.getTag() == DW_TAG_volatile_type; return U; @@ -651,8 +653,7 @@ void DWARFTypePrinter::appendSubroutineNameAfter( if (D.find(DW_AT_rvalue_reference)) OS << " &&"; - appendUnqualifiedNameAfter( - Inner, Inner.getAttributeValueAsReferencedDie(DW_AT_type)); + appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner)); } void DWARFTypePrinter::appendScopes(DWARFDie D) { if (D.getTag() == DW_TAG_compile_unit) @@ -665,6 +666,7 @@ void DWARFTypePrinter::appendScopes(DWARFDie D) { return; if (D.getTag() == DW_TAG_lexical_block) return; + D = D.resolveTypeUnitReference(); if (DWARFDie P = D.getParent()) appendScopes(P); appendUnqualifiedName(D); diff --git a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s index 5611963a585f6..aad748a301e6b 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units.s @@ -18,15 +18,12 @@ # doesn't really need templates - two local variables would've sufficed # (anything that references the type units) but I was working on something else # and this seemed minimal enough. -# A gcc-style type signature reference was also inserted. 
# CHECK: DW_TAG_template_type_parameter # CHECK: DW_AT_type ({{.*}} "t1") # CHECK: DW_TAG_template_type_parameter # CHECK: DW_AT_type ({{.*}} "t2") -# CHECK: DW_TAG_template_type_parameter -# CHECK: DW_AT_type (0xc6694e51369161f2 "t1") .text .file "test.cpp" @@ -273,13 +270,6 @@ _Z2f1IJ2t12t2EEvv: # @_Z2f1IJ2t12t2EEvv .byte 11 # DW_FORM_data1 .byte 0 # EOM(1) .byte 0 # EOM(2) - .byte 12 # Abbreviation Code - .byte 47 # DW_TAG_template_type_parameter - .byte 0 # DW_CHILDREN_no - .byte 73 # DW_AT_type - .byte 32 # DW_FORM_ref_sig8 - .byte 0 # EOM(1) - .byte 0 # EOM(2) .byte 0 # EOM(3) .section .debug_info,"",@progbits .Lcu_begin0: @@ -323,23 +313,18 @@ _Z2f1IJ2t12t2EEvv: # @_Z2f1IJ2t12t2EEvv .byte 6 # Abbrev [6] 0x46:0xd DW_TAG_GNU_template_parameter_pack .byte 5 # DW_AT_name .byte 7 # Abbrev [7] 0x48:0x5 DW_TAG_template_type_parameter - .long .Lt1_decl-.Lcu_begin0 # DW_AT_type + .long 88 # DW_AT_type .byte 7 # Abbrev [7] 0x4d:0x5 DW_TAG_template_type_parameter - # Simulate DWARF emitted by GCC where the signature is directly in the type attribute. - .long .Lt2_decl-.Lcu_begin0 # DW_AT_type - .byte 12 # Abbrev [12] DW_TAG_template_type_parameter - .quad -4149699470930386446 # DW_AT_type + .long 97 # DW_AT_type .byte 0 # End Of Children Mark .byte 0 # End Of Children Mark .byte 8 # Abbrev [8] 0x54:0x4 DW_TAG_base_type .byte 4 # DW_AT_name .byte 5 # DW_AT_encoding .byte 4 # DW_AT_byte_size - .Lt1_decl: .byte 9 # Abbrev [9] 0x58:0x9 DW_TAG_structure_type # DW_AT_declaration .quad -4149699470930386446 # DW_AT_signature - .Lt2_decl: .byte 9 # Abbrev [9] 0x61:0x9 DW_TAG_structure_type # DW_AT_declaration .quad 5649318945901130368 # DW_AT_signature From 80865c01e1b8d3a6bea308fda7bbc53047dcc2e7 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 18 Jul 2024 10:35:32 +0200 Subject: [PATCH 392/777] [Clang][OpenMP] Add reverse directive (#92916) Add the reverse directive which will be introduced in the upcoming OpenMP 6.0 specification. A preview has been published in [Technical Report 12](https://www.openmp.org/wp-content/uploads/openmp-TR12.pdf). 
--------- Co-authored-by: Alexey Bataev --- clang/include/clang-c/Index.h | 4 + clang/include/clang/AST/RecursiveASTVisitor.h | 3 + clang/include/clang/AST/StmtOpenMP.h | 69 +- clang/include/clang/Basic/StmtNodes.td | 1 + clang/include/clang/Sema/SemaOpenMP.h | 3 + .../include/clang/Serialization/ASTBitCodes.h | 1 + clang/lib/AST/StmtOpenMP.cpp | 18 + clang/lib/AST/StmtPrinter.cpp | 5 + clang/lib/AST/StmtProfile.cpp | 4 + clang/lib/Basic/OpenMPKinds.cpp | 2 +- clang/lib/CodeGen/CGStmt.cpp | 3 + clang/lib/CodeGen/CGStmtOpenMP.cpp | 8 + clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Parse/ParseOpenMP.cpp | 1 + clang/lib/Sema/SemaExceptionSpec.cpp | 1 + clang/lib/Sema/SemaOpenMP.cpp | 195 +++ clang/lib/Sema/TreeTransform.h | 11 + clang/lib/Serialization/ASTReaderStmt.cpp | 13 + clang/lib/Serialization/ASTWriterStmt.cpp | 5 + clang/test/OpenMP/reverse_ast_print.cpp | 159 ++ clang/test/OpenMP/reverse_codegen.cpp | 1554 +++++++++++++++++ clang/test/OpenMP/reverse_messages.cpp | 40 + clang/tools/libclang/CIndex.cpp | 7 + clang/tools/libclang/CXCursor.cpp | 3 + llvm/include/llvm/Frontend/OpenMP/OMP.td | 4 + .../test/transform/reverse/foreach.cpp | 162 ++ .../runtime/test/transform/reverse/intfor.c | 25 + .../test/transform/reverse/iterfor.cpp | 164 ++ .../parallel-wsloop-collapse-foreach.cpp | 285 +++ .../parallel-wsloop-collapse-intfor.cpp | 51 + 30 files changed, 2799 insertions(+), 3 deletions(-) create mode 100644 clang/test/OpenMP/reverse_ast_print.cpp create mode 100644 clang/test/OpenMP/reverse_codegen.cpp create mode 100644 clang/test/OpenMP/reverse_messages.cpp create mode 100644 openmp/runtime/test/transform/reverse/foreach.cpp create mode 100644 openmp/runtime/test/transform/reverse/intfor.c create mode 100644 openmp/runtime/test/transform/reverse/iterfor.cpp create mode 100644 openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp create mode 100644 openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 24ed23a628728..d096183ef2037 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2146,6 +2146,10 @@ enum CXCursorKind { */ CXCursor_OMPScopeDirective = 306, + /** OpenMP reverse directive. + */ + CXCursor_OMPReverseDirective = 307, + /** OpenACC Compute Construct. 
*/ CXCursor_OpenACCComputeConstruct = 320, diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 2785afd59bf21..36deec918c4b9 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3032,6 +3032,9 @@ DEF_TRAVERSE_STMT(OMPTileDirective, DEF_TRAVERSE_STMT(OMPUnrollDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) +DEF_TRAVERSE_STMT(OMPReverseDirective, + { TRY_TO(TraverseOMPExecutableDirective(S)); }) + DEF_TRAVERSE_STMT(OMPForDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index f735fa5643aec..e41a9e52b7674 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -1007,8 +1007,9 @@ class OMPLoopTransformationDirective : public OMPLoopBasedDirective { Stmt *getPreInits() const; static bool classof(const Stmt *T) { - return T->getStmtClass() == OMPTileDirectiveClass || - T->getStmtClass() == OMPUnrollDirectiveClass; + Stmt::StmtClass C = T->getStmtClass(); + return C == OMPTileDirectiveClass || C == OMPUnrollDirectiveClass || + C == OMPReverseDirectiveClass; } }; @@ -5711,6 +5712,70 @@ class OMPUnrollDirective final : public OMPLoopTransformationDirective { } }; +/// Represents the '#pragma omp reverse' loop transformation directive. +/// +/// \code +/// #pragma omp reverse +/// for (int i = 0; i < n; ++i) +/// ... +/// \endcode +class OMPReverseDirective final : public OMPLoopTransformationDirective { + friend class ASTStmtReader; + friend class OMPExecutableDirective; + + /// Offsets of child members. + enum { + PreInitsOffset = 0, + TransformedStmtOffset, + }; + + explicit OMPReverseDirective(SourceLocation StartLoc, SourceLocation EndLoc) + : OMPLoopTransformationDirective(OMPReverseDirectiveClass, + llvm::omp::OMPD_reverse, StartLoc, + EndLoc, 1) {} + + void setPreInits(Stmt *PreInits) { + Data->getChildren()[PreInitsOffset] = PreInits; + } + + void setTransformedStmt(Stmt *S) { + Data->getChildren()[TransformedStmtOffset] = S; + } + +public: + /// Create a new AST node representation for '#pragma omp reverse'. + /// + /// \param C Context of the AST. + /// \param StartLoc Location of the introducer (e.g. the 'omp' token). + /// \param EndLoc Location of the directive's end (e.g. the tok::eod). + /// \param AssociatedStmt The outermost associated loop. + /// \param TransformedStmt The loop nest after tiling, or nullptr in + /// dependent contexts. + /// \param PreInits Helper preinits statements for the loop nest. + static OMPReverseDirective * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits); + + /// Build an empty '#pragma omp reverse' AST node for deserialization. + /// + /// \param C Context of the AST. + /// \param NumClauses Number of clauses to allocate. + static OMPReverseDirective *CreateEmpty(const ASTContext &C); + + /// Gets/sets the associated loops after the transformation, i.e. after + /// de-sugaring. + Stmt *getTransformedStmt() const { + return Data->getChildren()[TransformedStmtOffset]; + } + + /// Return preinits statement. + Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; } + + static bool classof(const Stmt *T) { + return T->getStmtClass() == OMPReverseDirectiveClass; + } +}; + /// This represents '#pragma omp scan' directive. 
/// /// \code diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index c59a17be7808f..426b8ec4b4496 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -230,6 +230,7 @@ def OMPSimdDirective : StmtNode; def OMPLoopTransformationDirective : StmtNode; def OMPTileDirective : StmtNode; def OMPUnrollDirective : StmtNode; +def OMPReverseDirective : StmtNode; def OMPForDirective : StmtNode; def OMPForSimdDirective : StmtNode; def OMPSectionsDirective : StmtNode; diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 3edf1cc7c12f2..14dd3fec3e2e9 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -423,6 +423,9 @@ class SemaOpenMP : public SemaBase { StmtResult ActOnOpenMPUnrollDirective(ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc); + /// Called on well-formed '#pragma omp reverse'. + StmtResult ActOnOpenMPReverseDirective(Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); /// Called on well-formed '\#pragma omp for' after parsing /// of the associated statement. StmtResult diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 488994c05dc12..8cd24699505df 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1895,6 +1895,7 @@ enum StmtCode { STMT_OMP_SIMD_DIRECTIVE, STMT_OMP_TILE_DIRECTIVE, STMT_OMP_UNROLL_DIRECTIVE, + STMT_OMP_REVERSE_DIRECTIVE, STMT_OMP_FOR_DIRECTIVE, STMT_OMP_FOR_SIMD_DIRECTIVE, STMT_OMP_SECTIONS_DIRECTIVE, diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index c8792941a6bb6..8b19b4209757f 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -449,6 +449,24 @@ OMPUnrollDirective *OMPUnrollDirective::CreateEmpty(const ASTContext &C, SourceLocation(), SourceLocation()); } +OMPReverseDirective * +OMPReverseDirective::Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AssociatedStmt, + Stmt *TransformedStmt, Stmt *PreInits) { + OMPReverseDirective *Dir = createDirective( + C, std::nullopt, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, + EndLoc); + Dir->setTransformedStmt(TransformedStmt); + Dir->setPreInits(PreInits); + return Dir; +} + +OMPReverseDirective *OMPReverseDirective::CreateEmpty(const ASTContext &C) { + return createEmptyDirective( + C, /*NumClauses=*/0, /*HasAssociatedStmt=*/true, + TransformedStmtOffset + 1, SourceLocation(), SourceLocation()); +} + OMPForSimdDirective * OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, unsigned CollapsedNum, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 5241a5cdbf009..58fb222d99153 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -763,6 +763,11 @@ void StmtPrinter::VisitOMPUnrollDirective(OMPUnrollDirective *Node) { PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPReverseDirective(OMPReverseDirective *Node) { + Indent() << "#pragma omp reverse"; + PrintOMPExecutableDirective(Node); +} + void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) { Indent() << "#pragma omp for"; PrintOMPExecutableDirective(Node); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 1add5caaf9f2e..ba04136d9896e 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ 
b/clang/lib/AST/StmtProfile.cpp @@ -985,6 +985,10 @@ void StmtProfiler::VisitOMPUnrollDirective(const OMPUnrollDirective *S) { VisitOMPLoopTransformationDirective(S); } +void StmtProfiler::VisitOMPReverseDirective(const OMPReverseDirective *S) { + VisitOMPLoopTransformationDirective(S); +} + void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) { VisitOMPLoopDirective(S); } diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 766d6a8418a6a..a442b02ad3b55 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -684,7 +684,7 @@ bool clang::isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind) { } bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) { - return DKind == OMPD_tile || DKind == OMPD_unroll; + return DKind == OMPD_tile || DKind == OMPD_unroll || DKind == OMPD_reverse; } bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) { diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 2e65e9fd26099..47246494fc782 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -222,6 +222,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OMPUnrollDirectiveClass: EmitOMPUnrollDirective(cast(*S)); break; + case Stmt::OMPReverseDirectiveClass: + EmitOMPReverseDirective(cast(*S)); + break; case Stmt::OMPForDirectiveClass: EmitOMPForDirective(cast(*S)); break; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 4d05322951d0a..ec8eeabf96210 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -187,6 +187,8 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { PreInits = Tile->getPreInits(); } else if (const auto *Unroll = dyn_cast(&S)) { PreInits = Unroll->getPreInits(); + } else if (const auto *Reverse = dyn_cast(&S)) { + PreInits = Reverse->getPreInits(); } else { llvm_unreachable("Unknown loop-based directive kind."); } @@ -2762,6 +2764,12 @@ void CodeGenFunction::EmitOMPTileDirective(const OMPTileDirective &S) { EmitStmt(S.getTransformedStmt()); } +void CodeGenFunction::EmitOMPReverseDirective(const OMPReverseDirective &S) { + // Emit the de-sugared statement. 
+ OMPTransformDirectiveScopeRAII ReverseScope(*this, &S); + EmitStmt(S.getTransformedStmt()); +} + void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) { bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder; diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 1aac2ee9a5c90..9fe4391237819 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3817,6 +3817,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPSimdDirective(const OMPSimdDirective &S); void EmitOMPTileDirective(const OMPTileDirective &S); void EmitOMPUnrollDirective(const OMPUnrollDirective &S); + void EmitOMPReverseDirective(const OMPReverseDirective &S); void EmitOMPForDirective(const OMPForDirective &S); void EmitOMPForSimdDirective(const OMPForSimdDirective &S); void EmitOMPSectionsDirective(const OMPSectionsDirective &S); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 326cd22ff9005..5b2fd4f4d5397 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2885,6 +2885,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( } break; } + case OMPD_reverse: case OMPD_declare_target: { SourceLocation DTLoc = ConsumeAnyToken(); bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end); diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 35a85ef8c80a6..4192777d3d4bd 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1466,6 +1466,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OMPSimdDirectiveClass: case Stmt::OMPTileDirectiveClass: case Stmt::OMPUnrollDirectiveClass: + case Stmt::OMPReverseDirectiveClass: case Stmt::OMPSingleDirectiveClass: case Stmt::OMPTargetDataDirectiveClass: case Stmt::OMPTargetDirectiveClass: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index bc6894018065f..0f2e8bc1513a7 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -4405,6 +4405,7 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, case OMPD_section: case OMPD_tile: case OMPD_unroll: + case OMPD_reverse: break; default: processCapturedRegions(SemaRef, DKind, CurScope, @@ -6284,6 +6285,11 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPUnrollDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); break; + case OMPD_reverse: + assert(ClausesWithImplicit.empty() && + "reverse directive does not support any clauses"); + Res = ActOnOpenMPReverseDirective(AStmt, StartLoc, EndLoc); + break; case OMPD_for: Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); @@ -14040,6 +14046,8 @@ bool SemaOpenMP::checkTransformableLoopNest( DependentPreInits = Dir->getPreInits(); else if (auto *Dir = dyn_cast(Transform)) DependentPreInits = Dir->getPreInits(); + else if (auto *Dir = dyn_cast(Transform)) + DependentPreInits = Dir->getPreInits(); else llvm_unreachable("Unhandled loop transformation"); @@ -14658,6 +14666,193 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, buildPreInits(Context, PreInits)); } +StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + ASTContext &Context = getASTContext(); + Scope *CurScope = SemaRef.getCurScope(); + + // Empty statement should only be possible if there already was an error. 
+ if (!AStmt) + return StmtError(); + + constexpr unsigned NumLoops = 1; + Stmt *Body = nullptr; + SmallVector LoopHelpers( + NumLoops); + SmallVector, NumLoops + 1> OriginalInits; + if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers, + Body, OriginalInits)) + return StmtError(); + + // Delay applying the transformation to when template is completely + // instantiated. + if (SemaRef.CurContext->isDependentContext()) + return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt, + nullptr, nullptr); + + assert(LoopHelpers.size() == NumLoops && + "Expecting a single-dimensional loop iteration space"); + assert(OriginalInits.size() == NumLoops && + "Expecting a single-dimensional loop iteration space"); + OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers.front(); + + // Find the loop statement. + Stmt *LoopStmt = nullptr; + collectLoopStmts(AStmt, {LoopStmt}); + + // Determine the PreInit declarations. + SmallVector PreInits; + addLoopPreInits(Context, LoopHelper, LoopStmt, OriginalInits[0], PreInits); + + auto *IterationVarRef = cast(LoopHelper.IterationVarRef); + QualType IVTy = IterationVarRef->getType(); + uint64_t IVWidth = Context.getTypeSize(IVTy); + auto *OrigVar = cast(LoopHelper.Counters.front()); + + // Iteration variable SourceLocations. + SourceLocation OrigVarLoc = OrigVar->getExprLoc(); + SourceLocation OrigVarLocBegin = OrigVar->getBeginLoc(); + SourceLocation OrigVarLocEnd = OrigVar->getEndLoc(); + + // Locations pointing to the transformation. + SourceLocation TransformLoc = StartLoc; + SourceLocation TransformLocBegin = StartLoc; + SourceLocation TransformLocEnd = EndLoc; + + // Internal variable names. + std::string OrigVarName = OrigVar->getNameInfo().getAsString(); + SmallString<64> ForwardIVName(".forward.iv."); + ForwardIVName += OrigVarName; + SmallString<64> ReversedIVName(".reversed.iv."); + ReversedIVName += OrigVarName; + + // LoopHelper.Updates will read the logical iteration number from + // LoopHelper.IterationVarRef, compute the value of the user loop counter of + // that logical iteration from it, then assign it to the user loop counter + // variable. We cannot directly use LoopHelper.IterationVarRef as the + // induction variable of the generated loop because it may cause an underflow: + // \code{.c} + // for (unsigned i = 0; i < n; ++i) + // body(i); + // \endcode + // + // Naive reversal: + // \code{.c} + // for (unsigned i = n-1; i >= 0; --i) + // body(i); + // \endcode + // + // Instead, we introduce a new iteration variable representing the logical + // iteration counter of the original loop, convert it to the logical iteration + // number of the reversed loop, then let LoopHelper.Updates compute the user's + // loop iteration variable from it. + // \code{.cpp} + // for (auto .forward.iv = 0; .forward.iv < n; ++.forward.iv) { + // auto .reversed.iv = n - .forward.iv - 1; + // i = (.reversed.iv + 0) * 1; // LoopHelper.Updates + // body(i); // Body + // } + // \endcode + + // Subexpressions with more than one use. One of the constraints of an AST is + // that every node object must appear at most once, hence we define a lambda + // that creates a new AST node at every use. + CaptureVars CopyTransformer(SemaRef); + auto MakeNumIterations = [&CopyTransformer, &LoopHelper]() -> Expr * { + return AssertSuccess( + CopyTransformer.TransformExpr(LoopHelper.NumIterations)); + }; + + // Create the iteration variable for the forward loop (from 0 to n-1). 
+ VarDecl *ForwardIVDecl = + buildVarDecl(SemaRef, {}, IVTy, ForwardIVName, nullptr, OrigVar); + auto MakeForwardRef = [&SemaRef = this->SemaRef, ForwardIVDecl, IVTy, + OrigVarLoc]() { + return buildDeclRefExpr(SemaRef, ForwardIVDecl, IVTy, OrigVarLoc); + }; + + // Iteration variable for the reversed induction variable (from n-1 downto 0): + // Reuse the iteration variable created by checkOpenMPLoop. + auto *ReversedIVDecl = cast(IterationVarRef->getDecl()); + ReversedIVDecl->setDeclName( + &SemaRef.PP.getIdentifierTable().get(ReversedIVName)); + + // For init-statement: + // \code{.cpp} + // auto .forward.iv = 0; + // \endcode + auto *Zero = IntegerLiteral::Create(Context, llvm::APInt::getZero(IVWidth), + ForwardIVDecl->getType(), OrigVarLoc); + SemaRef.AddInitializerToDecl(ForwardIVDecl, Zero, /*DirectInit=*/false); + StmtResult Init = new (Context) + DeclStmt(DeclGroupRef(ForwardIVDecl), OrigVarLocBegin, OrigVarLocEnd); + if (!Init.isUsable()) + return StmtError(); + + // Forward iv cond-expression: + // \code{.cpp} + // .forward.iv < MakeNumIterations() + // \endcode + ExprResult Cond = + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, + MakeForwardRef(), MakeNumIterations()); + if (!Cond.isUsable()) + return StmtError(); + + // Forward incr-statement: + // \code{.c} + // ++.forward.iv + // \endcode + ExprResult Incr = SemaRef.BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(), + UO_PreInc, MakeForwardRef()); + if (!Incr.isUsable()) + return StmtError(); + + // Reverse the forward-iv: + // \code{.cpp} + // auto .reversed.iv = MakeNumIterations() - 1 - .forward.iv + // \endcode + auto *One = IntegerLiteral::Create(Context, llvm::APInt(IVWidth, 1), IVTy, + TransformLoc); + ExprResult Minus = SemaRef.BuildBinOp(CurScope, TransformLoc, BO_Sub, + MakeNumIterations(), One); + if (!Minus.isUsable()) + return StmtError(); + Minus = SemaRef.BuildBinOp(CurScope, TransformLoc, BO_Sub, Minus.get(), + MakeForwardRef()); + if (!Minus.isUsable()) + return StmtError(); + StmtResult InitReversed = new (Context) DeclStmt( + DeclGroupRef(ReversedIVDecl), TransformLocBegin, TransformLocEnd); + if (!InitReversed.isUsable()) + return StmtError(); + SemaRef.AddInitializerToDecl(ReversedIVDecl, Minus.get(), + /*DirectInit=*/false); + + // The new loop body. + SmallVector BodyStmts; + BodyStmts.reserve(LoopHelper.Updates.size() + 2 + + (isa(LoopStmt) ? 1 : 0)); + BodyStmts.push_back(InitReversed.get()); + llvm::append_range(BodyStmts, LoopHelper.Updates); + if (auto *CXXRangeFor = dyn_cast(LoopStmt)) + BodyStmts.push_back(CXXRangeFor->getLoopVarStmt()); + BodyStmts.push_back(Body); + auto *ReversedBody = + CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(), + Body->getBeginLoc(), Body->getEndLoc()); + + // Finally create the reversed For-statement. 
+ auto *ReversedFor = new (Context) + ForStmt(Context, Init.get(), Cond.get(), nullptr, Incr.get(), + ReversedBody, LoopHelper.Init->getBeginLoc(), + LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc()); + return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt, + ReversedFor, + buildPreInits(Context, PreInits)); +} + OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, SourceLocation StartLoc, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 79bc5e5c55c87..0f266171cb67a 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -9239,6 +9239,17 @@ TreeTransform::TransformOMPUnrollDirective(OMPUnrollDirective *D) { return Res; } +template +StmtResult +TreeTransform::TransformOMPReverseDirective(OMPReverseDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 6955b42f14e06..9d9329c9a58cc 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2445,6 +2445,10 @@ void ASTStmtReader::VisitOMPUnrollDirective(OMPUnrollDirective *D) { VisitOMPLoopTransformationDirective(D); } +void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) { + VisitOMPLoopTransformationDirective(D); +} + void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); D->setHasCancel(Record.readBool()); @@ -3464,6 +3468,15 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; } + case STMT_OMP_REVERSE_DIRECTIVE: { + assert(Record[ASTStmtReader::NumStmtFields] == 1 && + "Reverse directive accepts only a single loop"); + assert(Record[ASTStmtReader::NumStmtFields + 1] == 0 && + "Reverse directive has no clauses"); + S = OMPReverseDirective::CreateEmpty(Context); + break; + } + case STMT_OMP_FOR_DIRECTIVE: { unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields]; unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1]; diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index d36f43fdaf262..ba5824e585972 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2437,6 +2437,11 @@ void ASTStmtWriter::VisitOMPUnrollDirective(OMPUnrollDirective *D) { Code = serialization::STMT_OMP_UNROLL_DIRECTIVE; } +void ASTStmtWriter::VisitOMPReverseDirective(OMPReverseDirective *D) { + VisitOMPLoopTransformationDirective(D); + Code = serialization::STMT_OMP_REVERSE_DIRECTIVE; +} + void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); Record.writeBool(D->hasCancel()); diff --git a/clang/test/OpenMP/reverse_ast_print.cpp b/clang/test/OpenMP/reverse_ast_print.cpp new file mode 100644 index 0000000000000..3ff6d18cfdf8b --- /dev/null +++ b/clang/test/OpenMP/reverse_ast_print.cpp @@ -0,0 +1,159 @@ +// Check no warnings/errors +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s +// expected-no-diagnostics + +// Check AST and unparsing +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 
-ast-dump %s | FileCheck %s --check-prefix=DUMP +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...); + +// PRINT-LABEL: void foo1( +// DUMP-LABEL: FunctionDecl {{.*}} foo1 +void foo1() { + // PRINT: #pragma omp reverse + // DUMP: OMPReverseDirective + #pragma omp reverse + // PRINT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void foo2( +// DUMP-LABEL: FunctionDecl {{.*}} foo2 +void foo2(int start, int end, int step) { + // PRINT: #pragma omp reverse + // DUMP: OMPReverseDirective + #pragma omp reverse + // PRINT: for (int i = start; i < end; i += step) + // DUMP-NEXT: ForStmt + for (int i = start; i < end; i += step) + // PRINT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void foo3( +// DUMP-LABEL: FunctionDecl {{.*}} foo3 +void foo3() { + // PRINT: #pragma omp for + // DUMP: OMPForDirective + // DUMP-NEXT: CapturedStmt + // DUMP-NEXT: CapturedDecl + #pragma omp for + // PRINT: #pragma omp reverse + // DUMP-NEXT: OMPReverseDirective + #pragma omp reverse + for (int i = 7; i < 17; i += 3) + // PRINT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void foo4( +// DUMP-LABEL: FunctionDecl {{.*}} foo4 +void foo4() { + // PRINT: #pragma omp for collapse(2) + // DUMP: OMPForDirective + // DUMP-NEXT: OMPCollapseClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 2 + // DUMP-NEXT: IntegerLiteral {{.*}} 2 + // DUMP-NEXT: CapturedStmt + // DUMP-NEXT: CapturedDecl + #pragma omp for collapse(2) + // PRINT: #pragma omp reverse + // DUMP: OMPReverseDirective + #pragma omp reverse + // PRINT: for (int i = 7; i < 17; i += 1) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 1) + // PRINT: for (int j = 7; j < 17; j += 1) + // DUMP: ForStmt + for (int j = 7; j < 17; j += 1) + // PRINT: body(i, j); + // DUMP: CallExpr + body(i, j); +} + + +// PRINT-LABEL: void foo5( +// DUMP-LABEL: FunctionDecl {{.*}} foo5 +void foo5(int start, int end, int step) { + // PRINT: #pragma omp for collapse(2) + // DUMP: OMPForDirective + // DUMP-NEXT: OMPCollapseClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 2 + // DUMP-NEXT: IntegerLiteral {{.*}} 2 + // DUMP-NEXT: CapturedStmt + // DUMP-NEXT: CapturedDecl + #pragma omp for collapse(2) + // PRINT: for (int i = 7; i < 17; i += 1) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 1) + // PRINT: #pragma omp reverse + // DUMP: OMPReverseDirective + #pragma omp reverse + // PRINT: for (int j = 7; j < 17; j += 1) + // DUMP-NEXT: ForStmt + for (int j = 7; j < 17; j += 1) + // PRINT: body(i, j); + // DUMP: CallExpr + body(i, j); +} + + +// PRINT-LABEL: void foo6( +// DUMP-LABEL: FunctionTemplateDecl {{.*}} foo6 +template +void foo6(T start, T end) { + // PRINT: #pragma omp reverse + // DUMP: OMPReverseDirective + #pragma omp reverse + // PRINT-NEXT: for (T i = start; i < end; i += Step) + // 
DUMP-NEXT: ForStmt + for (T i = start; i < end; i += Step) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + +// Also test instantiating the template. +void tfoo6() { + foo6(0, 42); +} + + +// PRINT-LABEL: void foo7( +// DUMP-LABEL: FunctionDecl {{.*}} foo7 +void foo7() { + double arr[128]; + // PRINT: #pragma omp reverse + // DUMP: OMPReverseDirective + #pragma omp reverse + // PRINT-NEXT: for (auto &&v : arr) + // DUMP-NEXT: CXXForRangeStmt + for (auto &&v : arr) + // PRINT-NEXT: body(v); + // DUMP: CallExpr + body(v); +} + +#endif + diff --git a/clang/test/OpenMP/reverse_codegen.cpp b/clang/test/OpenMP/reverse_codegen.cpp new file mode 100644 index 0000000000000..9adaa6cc7d18d --- /dev/null +++ b/clang/test/OpenMP/reverse_codegen.cpp @@ -0,0 +1,1554 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ + +// expected-no-diagnostics + +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2 + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +struct S { + int i; + S() { +#pragma omp reverse + for (i = 7; i < 17; i += 3) + body(i); + } +} s; + + +extern "C" void foo1(int start, int end, int step) { + int i; +#pragma omp reverse + for (i = start; i < end; i += step) + body(i); +} + + +extern "C" void foo2() { +#pragma omp for +#pragma omp reverse + for (int i = 7; i < 17; i += 3) + body(i); +} + + +extern "C" void foo3() { +#pragma omp for collapse(3) + for (int k = 7; k < 17; k += 3) +#pragma omp reverse + for (int i = 7; i < 17; i += 3) + for (int j = 7; j < 17; j += 3) + body(k, i, j); +} + + +extern "C" void foo4() { +#pragma omp parallel for +#pragma omp reverse + for (int i = 7; i < 17; i += 3) + body(i); +} + + +template +void foo5(T start, T end) { +#pragma omp reverse + for (T i = start; i < end; i += Step) + body(i); +} + +extern "C" void tfoo5() { + foo5(0, 42); +} + + +extern "C" void foo6() { + double arr[128]; +#pragma omp reverse + for (int c = 42; auto && v : arr) + body(v, c); +} + + +extern "C" void foo7() { + double A[128]; + +#pragma omp for collapse(3) + for (int k = 7; k < 17; k += 3) +#pragma omp reverse + for (int c = 42; auto && v : A) + for (int j = 7; j < 17; j += 3) + body(k, c, v, j); +} + +#endif /* HEADER */ + +// CHECK1-LABEL: define {{[^@]+}}@body +// CHECK1-SAME: (...) 
#[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init +// CHECK1-SAME: () #[[ATTR1:[0-9]+]] section ".text.startup" { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC1Ev +// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: call void @_ZN1SC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2Ev +// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[I2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK1-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[I3]], ptr [[I2]], align 8 +// CHECK1-NEXT: store i32 0, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND:%.*]] +// CHECK1: for.cond: +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK1: for.body: +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 3, [[TMP1]] +// CHECK1-NEXT: store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 3 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I2]], align 8 +// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP3]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP5]]) +// CHECK1-NEXT: br label [[FOR_INC:%.*]] +// CHECK1: for.inc: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK1: for.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo1 +// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]] +// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND:%.*]] +// CHECK1: for.cond: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP9]], 1 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]] +// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK1: for.body: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD6:%.*]] = add i32 [[TMP10]], 1 +// CHECK1-NEXT: [[SUB7:%.*]] = sub i32 [[ADD6]], 1 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP11]] +// CHECK1-NEXT: store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: 
[[TMP13:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP13]], [[TMP14]] +// CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP12]], [[MUL]] +// CHECK1-NEXT: store i32 [[ADD9]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP15]]) +// CHECK1-NEXT: br label [[FOR_INC:%.*]] +// CHECK1: for.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP16]], 1 +// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] +// CHECK1: for.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo2 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 3, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 3 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 3, [[TMP7]] +// CHECK1-NEXT: store i32 [[SUB]], 
ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[MUL2:%.*]] = mul nsw i32 [[TMP8]], 3 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 7, [[MUL2]] +// CHECK1-NEXT: store i32 [[ADD3]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP9]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[TMP0]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo3 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK1-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 63, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 63 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP3]], label 
[[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP6]], 16 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 3 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP8]], 16 +// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 16 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL5]] +// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB]], 4 +// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[DIV6]], 1 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK1-NEXT: store i32 [[ADD8]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV9:%.*]] = sdiv i32 [[TMP10]], 16 +// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 16 +// CHECK1-NEXT: [[SUB11:%.*]] = sub nsw i32 [[TMP9]], [[MUL10]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV12:%.*]] = sdiv i32 [[TMP12]], 16 +// CHECK1-NEXT: [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 16 +// CHECK1-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP11]], [[MUL13]] +// CHECK1-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 4 +// CHECK1-NEXT: [[MUL16:%.*]] = mul nsw i32 [[DIV15]], 4 +// CHECK1-NEXT: [[SUB17:%.*]] = sub nsw i32 [[SUB11]], [[MUL16]] +// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[SUB17]], 3 +// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 7, [[MUL18]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i32 3, [[TMP13]] +// CHECK1-NEXT: store i32 [[SUB20]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP14]], 3 +// CHECK1-NEXT: [[ADD22:%.*]] = add nsw i32 7, [[MUL21]] +// CHECK1-NEXT: store i32 [[ADD22]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[J]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP15]], i32 noundef [[TMP16]], i32 noundef [[TMP17]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo4 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo4.omp_outlined) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo4.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 3, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 3 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 3, [[TMP8]] +// CHECK1-NEXT: store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[MUL2:%.*]] = mul nsw i32 [[TMP9]], 3 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 7, [[MUL2]] +// CHECK1-NEXT: store i32 [[ADD3]], ptr 
[[I]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP10]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK1-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@tfoo5 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void @_Z4foo5IiTnT_Li3EEvS0_S0_(i32 noundef 0, i32 noundef 42) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z4foo5IiTnT_Li3EEvS0_S0_ +// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]] +// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 3 +// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 3 +// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND:%.*]] +// CHECK1: for.cond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP6]], 1 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP5]], [[ADD5]] +// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK1: for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD6:%.*]] = add i32 [[TMP7]], 1 +// CHECK1-NEXT: [[SUB7:%.*]] = sub i32 [[ADD6]], 1 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP8]] +// CHECK1-NEXT: store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4 +// 
CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 3 +// CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP9]], [[MUL]] +// CHECK1-NEXT: store i32 [[ADD9]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP11]]) +// CHECK1-NEXT: br label [[FOR_INC:%.*]] +// CHECK1: for.inc: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP12]], 1 +// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK1: for.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo6 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[ARR:%.*]] = alloca [128 x double], align 16 +// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTFORWARD_IV___BEGIN2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTREVERSED_IV___BEGIN2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store i32 42, ptr [[C]], align 4 +// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128 +// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8 +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1 +// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1 +// CHECK1-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8 +// CHECK1-NEXT: store i64 0, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK1-NEXT: br label [[FOR_COND:%.*]] +// CHECK1: for.cond: +// 
CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8 +// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]] +// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK1: for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i64 [[TMP8]], 1 +// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[ADD7]], 1 +// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK1-NEXT: [[SUB9:%.*]] = sub nsw i64 [[SUB8]], [[TMP9]] +// CHECK1-NEXT: store i64 [[SUB9]], ptr [[DOTREVERSED_IV___BEGIN2]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN2]], align 8 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP11]], 1 +// CHECK1-NEXT: [[ADD_PTR10:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[MUL]] +// CHECK1-NEXT: store ptr [[ADD_PTR10]], ptr [[__BEGIN2]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[__BEGIN2]], align 8 +// CHECK1-NEXT: store ptr [[TMP12]], ptr [[V]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[V]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[C]], align 4 +// CHECK1-NEXT: call void (...) @body(double noundef [[TMP14]], i32 noundef [[TMP15]]) +// CHECK1-NEXT: br label [[FOR_INC:%.*]] +// CHECK1: for.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK1: for.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@foo7 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A:%.*]] = alloca [128 x double], align 16 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[__RANGE3:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__END3:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__BEGIN3:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV___BEGIN3:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K15:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFORWARD_IV___BEGIN316:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[J17:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTREVERSED_IV___BEGIN3:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: 
[[V:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK1-NEXT: store i32 42, ptr [[C]], align 4 +// CHECK1-NEXT: store ptr [[A]], ptr [[__RANGE3]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE3]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0 +// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128 +// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END3]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE3]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY3:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY3]], ptr [[__BEGIN3]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__RANGE3]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY4:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP3]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY4]], ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END3]], align 8 +// CHECK1-NEXT: store ptr [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_5]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP6]] to i64 +// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1 +// CHECK1-NEXT: [[SUB7:%.*]] = sub nsw i64 [[DIV]], 1 +// CHECK1-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP7]], 1 +// CHECK1-NEXT: store i64 [[ADD9]], ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB11:%.*]] = sub nsw i64 [[TMP8]], 0 +// CHECK1-NEXT: [[DIV12:%.*]] = sdiv i64 [[SUB11]], 1 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 4, [[DIV12]] +// CHECK1-NEXT: [[MUL13:%.*]] = mul nsw i64 [[MUL]], 4 +// CHECK1-NEXT: [[SUB14:%.*]] = sub nsw i64 [[MUL13]], 1 +// CHECK1-NEXT: store i64 [[SUB14]], ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK1-NEXT: store i32 7, ptr [[K]], align 4 +// CHECK1-NEXT: store i64 0, ptr [[DOTFORWARD_IV___BEGIN3]], align 8 +// CHECK1-NEXT: store i32 7, ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i64 0, [[TMP9]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK1-NEXT: store i64 [[TMP10]], ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK1-NEXT: [[TMP11:%.*]] = 
load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK1-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: br i1 [[CMP18]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i64 [[COND]], ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[CMP19:%.*]] = icmp sle i64 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP19]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i64 [[TMP19]], 0 +// CHECK1-NEXT: [[DIV21:%.*]] = sdiv i64 [[SUB20]], 1 +// CHECK1-NEXT: [[MUL22:%.*]] = mul nsw i64 1, [[DIV21]] +// CHECK1-NEXT: [[MUL23:%.*]] = mul nsw i64 [[MUL22]], 4 +// CHECK1-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[MUL23]] +// CHECK1-NEXT: [[MUL25:%.*]] = mul nsw i64 [[DIV24]], 3 +// CHECK1-NEXT: [[ADD26:%.*]] = add nsw i64 7, [[MUL25]] +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[ADD26]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], ptr [[K15]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB27:%.*]] = sub nsw i64 [[TMP22]], 0 +// CHECK1-NEXT: [[DIV28:%.*]] = sdiv i64 [[SUB27]], 1 +// CHECK1-NEXT: [[MUL29:%.*]] = mul nsw i64 1, [[DIV28]] +// CHECK1-NEXT: [[MUL30:%.*]] = mul nsw i64 [[MUL29]], 4 +// CHECK1-NEXT: [[DIV31:%.*]] = sdiv i64 [[TMP21]], [[MUL30]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP23]], 0 +// CHECK1-NEXT: [[DIV33:%.*]] = sdiv i64 [[SUB32]], 1 +// CHECK1-NEXT: [[MUL34:%.*]] = mul nsw i64 1, [[DIV33]] +// CHECK1-NEXT: [[MUL35:%.*]] = mul nsw i64 [[MUL34]], 4 +// CHECK1-NEXT: [[MUL36:%.*]] = mul nsw i64 [[DIV31]], [[MUL35]] +// CHECK1-NEXT: [[SUB37:%.*]] = sub nsw i64 [[TMP20]], [[MUL36]] +// CHECK1-NEXT: [[DIV38:%.*]] = sdiv i64 [[SUB37]], 4 +// CHECK1-NEXT: [[MUL39:%.*]] = mul nsw i64 [[DIV38]], 1 +// CHECK1-NEXT: [[ADD40:%.*]] = add nsw i64 0, [[MUL39]] +// CHECK1-NEXT: store i64 [[ADD40]], ptr [[DOTFORWARD_IV___BEGIN316]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB41:%.*]] = sub nsw i64 [[TMP26]], 0 +// CHECK1-NEXT: [[DIV42:%.*]] = sdiv i64 [[SUB41]], 1 +// CHECK1-NEXT: [[MUL43:%.*]] = mul nsw i64 1, [[DIV42]] 
+// CHECK1-NEXT: [[MUL44:%.*]] = mul nsw i64 [[MUL43]], 4 +// CHECK1-NEXT: [[DIV45:%.*]] = sdiv i64 [[TMP25]], [[MUL44]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB46:%.*]] = sub nsw i64 [[TMP27]], 0 +// CHECK1-NEXT: [[DIV47:%.*]] = sdiv i64 [[SUB46]], 1 +// CHECK1-NEXT: [[MUL48:%.*]] = mul nsw i64 1, [[DIV47]] +// CHECK1-NEXT: [[MUL49:%.*]] = mul nsw i64 [[MUL48]], 4 +// CHECK1-NEXT: [[MUL50:%.*]] = mul nsw i64 [[DIV45]], [[MUL49]] +// CHECK1-NEXT: [[SUB51:%.*]] = sub nsw i64 [[TMP24]], [[MUL50]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB52:%.*]] = sub nsw i64 [[TMP30]], 0 +// CHECK1-NEXT: [[DIV53:%.*]] = sdiv i64 [[SUB52]], 1 +// CHECK1-NEXT: [[MUL54:%.*]] = mul nsw i64 1, [[DIV53]] +// CHECK1-NEXT: [[MUL55:%.*]] = mul nsw i64 [[MUL54]], 4 +// CHECK1-NEXT: [[DIV56:%.*]] = sdiv i64 [[TMP29]], [[MUL55]] +// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB57:%.*]] = sub nsw i64 [[TMP31]], 0 +// CHECK1-NEXT: [[DIV58:%.*]] = sdiv i64 [[SUB57]], 1 +// CHECK1-NEXT: [[MUL59:%.*]] = mul nsw i64 1, [[DIV58]] +// CHECK1-NEXT: [[MUL60:%.*]] = mul nsw i64 [[MUL59]], 4 +// CHECK1-NEXT: [[MUL61:%.*]] = mul nsw i64 [[DIV56]], [[MUL60]] +// CHECK1-NEXT: [[SUB62:%.*]] = sub nsw i64 [[TMP28]], [[MUL61]] +// CHECK1-NEXT: [[DIV63:%.*]] = sdiv i64 [[SUB62]], 4 +// CHECK1-NEXT: [[MUL64:%.*]] = mul nsw i64 [[DIV63]], 4 +// CHECK1-NEXT: [[SUB65:%.*]] = sub nsw i64 [[SUB51]], [[MUL64]] +// CHECK1-NEXT: [[MUL66:%.*]] = mul nsw i64 [[SUB65]], 3 +// CHECK1-NEXT: [[ADD67:%.*]] = add nsw i64 7, [[MUL66]] +// CHECK1-NEXT: [[CONV68:%.*]] = trunc i64 [[ADD67]] to i32 +// CHECK1-NEXT: store i32 [[CONV68]], ptr [[J17]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8 +// CHECK1-NEXT: [[ADD69:%.*]] = add nsw i64 [[TMP32]], 1 +// CHECK1-NEXT: [[SUB70:%.*]] = sub nsw i64 [[ADD69]], 1 +// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN316]], align 8 +// CHECK1-NEXT: [[SUB71:%.*]] = sub nsw i64 [[SUB70]], [[TMP33]] +// CHECK1-NEXT: store i64 [[SUB71]], ptr [[DOTREVERSED_IV___BEGIN3]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN3]], align 8 +// CHECK1-NEXT: [[MUL72:%.*]] = mul nsw i64 [[TMP35]], 1 +// CHECK1-NEXT: [[ADD_PTR73:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[MUL72]] +// CHECK1-NEXT: store ptr [[ADD_PTR73]], ptr [[__BEGIN3]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[__BEGIN3]], align 8 +// CHECK1-NEXT: store ptr [[TMP36]], ptr [[V]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[K15]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[C]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load ptr, ptr [[V]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = load double, ptr [[TMP39]], align 8 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[J17]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP37]], i32 noundef [[TMP38]], double noundef [[TMP40]], i32 noundef [[TMP41]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[ADD74:%.*]] = add nsw i64 [[TMP42]], 1 +// CHECK1-NEXT: store i64 [[ADD74]], ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_reverse_codegen.cpp +// CHECK1-SAME: () #[[ATTR1]] section ".text.startup" { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void @__cxx_global_var_init() +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__cxx_global_var_init +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] section ".text.startup" { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC1Ev +// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK2-NEXT: call void @_ZN1SC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC2Ev +// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[I2:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK2-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: store ptr [[I3]], ptr [[I2]], align 8 +// CHECK2-NEXT: store i32 0, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND:%.*]] +// CHECK2: for.cond: +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK2: for.body: +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 3, [[TMP1]] +// CHECK2-NEXT: store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 3 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I2]], 
align 8 +// CHECK2-NEXT: store i32 [[ADD]], ptr [[TMP3]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I2]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP5]]) +// CHECK2-NEXT: br label [[FOR_INC:%.*]] +// CHECK2: for.inc: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK2: for.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@body +// CHECK2-SAME: (...) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo1 +// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]] +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]] +// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND:%.*]] +// CHECK2: for.cond: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP9]], 1 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]] +// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK2: for.body: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD6:%.*]] = add i32 [[TMP10]], 1 +// 
CHECK2-NEXT: [[SUB7:%.*]] = sub i32 [[ADD6]], 1 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP11]] +// CHECK2-NEXT: store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP13]], [[TMP14]] +// CHECK2-NEXT: [[ADD9:%.*]] = add i32 [[TMP12]], [[MUL]] +// CHECK2-NEXT: store i32 [[ADD9]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP15]]) +// CHECK2-NEXT: br label [[FOR_INC:%.*]] +// CHECK2: for.inc: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP16]], 1 +// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] +// CHECK2: for.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo2 +// CHECK2-SAME: () #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 3, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 3 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 
4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 3, [[TMP7]] +// CHECK2-NEXT: store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[MUL2:%.*]] = mul nsw i32 [[TMP8]], 3 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 7, [[MUL2]] +// CHECK2-NEXT: store i32 [[ADD3]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP9]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[TMP0]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo3 +// CHECK2-SAME: () #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK2-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 63, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 63 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4 
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP6]], 16 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 3 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP8]], 16 +// CHECK2-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 16 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL5]] +// CHECK2-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB]], 4 +// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[DIV6]], 1 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK2-NEXT: store i32 [[ADD8]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV9:%.*]] = sdiv i32 [[TMP10]], 16 +// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 16 +// CHECK2-NEXT: [[SUB11:%.*]] = sub nsw i32 [[TMP9]], [[MUL10]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV12:%.*]] = sdiv i32 [[TMP12]], 16 +// CHECK2-NEXT: [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 16 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP11]], [[MUL13]] +// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 4 +// CHECK2-NEXT: [[MUL16:%.*]] = mul nsw i32 [[DIV15]], 4 +// CHECK2-NEXT: [[SUB17:%.*]] = sub nsw i32 [[SUB11]], [[MUL16]] +// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[SUB17]], 3 +// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 7, [[MUL18]] +// CHECK2-NEXT: store i32 [[ADD19]], ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[SUB20:%.*]] = sub nsw i32 3, [[TMP13]] +// CHECK2-NEXT: store i32 [[SUB20]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP14]], 3 +// CHECK2-NEXT: [[ADD22:%.*]] = add nsw i32 7, [[MUL21]] +// CHECK2-NEXT: store i32 [[ADD22]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[J]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP15]], i32 noundef [[TMP16]], i32 noundef [[TMP17]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo4 +// CHECK2-SAME: () #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo4.omp_outlined) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo4.omp_outlined +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 3, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 3 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 3, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: 
[[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 3, [[TMP8]] +// CHECK2-NEXT: store i32 [[SUB]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[MUL2:%.*]] = mul nsw i32 [[TMP9]], 3 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 7, [[MUL2]] +// CHECK2-NEXT: store i32 [[ADD3]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP10]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK2-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo6 +// CHECK2-SAME: () #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[ARR:%.*]] = alloca [128 x double], align 16 +// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTFORWARD_IV___BEGIN2:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTREVERSED_IV___BEGIN2:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: store i32 42, ptr [[C]], align 4 +// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP0]], i64 0, i64 0 +// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128 +// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8 +// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = 
load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1 +// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1 +// CHECK2-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8 +// CHECK2-NEXT: store i64 0, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK2-NEXT: br label [[FOR_COND:%.*]] +// CHECK2: for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8 +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]] +// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK2: for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i64 [[TMP8]], 1 +// CHECK2-NEXT: [[SUB8:%.*]] = sub nsw i64 [[ADD7]], 1 +// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK2-NEXT: [[SUB9:%.*]] = sub nsw i64 [[SUB8]], [[TMP9]] +// CHECK2-NEXT: store i64 [[SUB9]], ptr [[DOTREVERSED_IV___BEGIN2]], align 8 +// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN2]], align 8 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP11]], 1 +// CHECK2-NEXT: [[ADD_PTR10:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[MUL]] +// CHECK2-NEXT: store ptr [[ADD_PTR10]], ptr [[__BEGIN2]], align 8 +// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[__BEGIN2]], align 8 +// CHECK2-NEXT: store ptr [[TMP12]], ptr [[V]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[V]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[C]], align 4 +// CHECK2-NEXT: call void (...) 
@body(double noundef [[TMP14]], i32 noundef [[TMP15]]) +// CHECK2-NEXT: br label [[FOR_INC:%.*]] +// CHECK2: for.inc: +// CHECK2-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTFORWARD_IV___BEGIN2]], align 8 +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK2: for.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@foo7 +// CHECK2-SAME: () #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A:%.*]] = alloca [128 x double], align 16 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[__RANGE3:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[__END3:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[__BEGIN3:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV___BEGIN3:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K15:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV___BEGIN316:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[J17:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV___BEGIN3:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK2-NEXT: store i32 42, ptr [[C]], align 4 +// CHECK2-NEXT: store ptr [[A]], ptr [[__RANGE3]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE3]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP1]], i64 0, i64 0 +// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 128 +// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END3]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE3]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY3:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP2]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY3]], ptr [[__BEGIN3]], align 8 +// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__RANGE3]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY4:%.*]] = getelementptr inbounds [128 x double], ptr [[TMP3]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY4]], ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END3]], align 8 +// CHECK2-NEXT: store ptr [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 8 +// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_5]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP6]] to i64 +// CHECK2-NEXT: 
[[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1 +// CHECK2-NEXT: [[SUB7:%.*]] = sub nsw i64 [[DIV]], 1 +// CHECK2-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_6]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP7]], 1 +// CHECK2-NEXT: store i64 [[ADD9]], ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB11:%.*]] = sub nsw i64 [[TMP8]], 0 +// CHECK2-NEXT: [[DIV12:%.*]] = sdiv i64 [[SUB11]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 4, [[DIV12]] +// CHECK2-NEXT: [[MUL13:%.*]] = mul nsw i64 [[MUL]], 4 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i64 [[MUL13]], 1 +// CHECK2-NEXT: store i64 [[SUB14]], ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK2-NEXT: store i32 7, ptr [[K]], align 4 +// CHECK2-NEXT: store i64 0, ptr [[DOTFORWARD_IV___BEGIN3]], align 8 +// CHECK2-NEXT: store i32 7, ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i64 0, [[TMP9]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK2-NEXT: store i64 [[TMP10]], ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK2-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: br i1 [[CMP18]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i64 [[COND]], ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8 +// CHECK2-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[CMP19:%.*]] = icmp sle i64 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP19]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB20:%.*]] = sub nsw i64 [[TMP19]], 0 +// 
CHECK2-NEXT: [[DIV21:%.*]] = sdiv i64 [[SUB20]], 1 +// CHECK2-NEXT: [[MUL22:%.*]] = mul nsw i64 1, [[DIV21]] +// CHECK2-NEXT: [[MUL23:%.*]] = mul nsw i64 [[MUL22]], 4 +// CHECK2-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[MUL23]] +// CHECK2-NEXT: [[MUL25:%.*]] = mul nsw i64 [[DIV24]], 3 +// CHECK2-NEXT: [[ADD26:%.*]] = add nsw i64 7, [[MUL25]] +// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[ADD26]] to i32 +// CHECK2-NEXT: store i32 [[CONV]], ptr [[K15]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB27:%.*]] = sub nsw i64 [[TMP22]], 0 +// CHECK2-NEXT: [[DIV28:%.*]] = sdiv i64 [[SUB27]], 1 +// CHECK2-NEXT: [[MUL29:%.*]] = mul nsw i64 1, [[DIV28]] +// CHECK2-NEXT: [[MUL30:%.*]] = mul nsw i64 [[MUL29]], 4 +// CHECK2-NEXT: [[DIV31:%.*]] = sdiv i64 [[TMP21]], [[MUL30]] +// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP23]], 0 +// CHECK2-NEXT: [[DIV33:%.*]] = sdiv i64 [[SUB32]], 1 +// CHECK2-NEXT: [[MUL34:%.*]] = mul nsw i64 1, [[DIV33]] +// CHECK2-NEXT: [[MUL35:%.*]] = mul nsw i64 [[MUL34]], 4 +// CHECK2-NEXT: [[MUL36:%.*]] = mul nsw i64 [[DIV31]], [[MUL35]] +// CHECK2-NEXT: [[SUB37:%.*]] = sub nsw i64 [[TMP20]], [[MUL36]] +// CHECK2-NEXT: [[DIV38:%.*]] = sdiv i64 [[SUB37]], 4 +// CHECK2-NEXT: [[MUL39:%.*]] = mul nsw i64 [[DIV38]], 1 +// CHECK2-NEXT: [[ADD40:%.*]] = add nsw i64 0, [[MUL39]] +// CHECK2-NEXT: store i64 [[ADD40]], ptr [[DOTFORWARD_IV___BEGIN316]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB41:%.*]] = sub nsw i64 [[TMP26]], 0 +// CHECK2-NEXT: [[DIV42:%.*]] = sdiv i64 [[SUB41]], 1 +// CHECK2-NEXT: [[MUL43:%.*]] = mul nsw i64 1, [[DIV42]] +// CHECK2-NEXT: [[MUL44:%.*]] = mul nsw i64 [[MUL43]], 4 +// CHECK2-NEXT: [[DIV45:%.*]] = sdiv i64 [[TMP25]], [[MUL44]] +// CHECK2-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB46:%.*]] = sub nsw i64 [[TMP27]], 0 +// CHECK2-NEXT: [[DIV47:%.*]] = sdiv i64 [[SUB46]], 1 +// CHECK2-NEXT: [[MUL48:%.*]] = mul nsw i64 1, [[DIV47]] +// CHECK2-NEXT: [[MUL49:%.*]] = mul nsw i64 [[MUL48]], 4 +// CHECK2-NEXT: [[MUL50:%.*]] = mul nsw i64 [[DIV45]], [[MUL49]] +// CHECK2-NEXT: [[SUB51:%.*]] = sub nsw i64 [[TMP24]], [[MUL50]] +// CHECK2-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB52:%.*]] = sub nsw i64 [[TMP30]], 0 +// CHECK2-NEXT: [[DIV53:%.*]] = sdiv i64 [[SUB52]], 1 +// CHECK2-NEXT: [[MUL54:%.*]] = mul nsw i64 1, [[DIV53]] +// CHECK2-NEXT: [[MUL55:%.*]] = mul nsw i64 [[MUL54]], 4 +// CHECK2-NEXT: [[DIV56:%.*]] = sdiv i64 [[TMP29]], [[MUL55]] +// CHECK2-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB57:%.*]] = sub nsw i64 [[TMP31]], 0 +// CHECK2-NEXT: [[DIV58:%.*]] = sdiv i64 [[SUB57]], 1 +// CHECK2-NEXT: [[MUL59:%.*]] = mul nsw i64 1, [[DIV58]] +// CHECK2-NEXT: [[MUL60:%.*]] = mul nsw i64 [[MUL59]], 4 +// CHECK2-NEXT: [[MUL61:%.*]] = mul nsw i64 [[DIV56]], [[MUL60]] +// CHECK2-NEXT: [[SUB62:%.*]] = sub nsw i64 [[TMP28]], 
[[MUL61]] +// CHECK2-NEXT: [[DIV63:%.*]] = sdiv i64 [[SUB62]], 4 +// CHECK2-NEXT: [[MUL64:%.*]] = mul nsw i64 [[DIV63]], 4 +// CHECK2-NEXT: [[SUB65:%.*]] = sub nsw i64 [[SUB51]], [[MUL64]] +// CHECK2-NEXT: [[MUL66:%.*]] = mul nsw i64 [[SUB65]], 3 +// CHECK2-NEXT: [[ADD67:%.*]] = add nsw i64 7, [[MUL66]] +// CHECK2-NEXT: [[CONV68:%.*]] = trunc i64 [[ADD67]] to i32 +// CHECK2-NEXT: store i32 [[CONV68]], ptr [[J17]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_6]], align 8 +// CHECK2-NEXT: [[ADD69:%.*]] = add nsw i64 [[TMP32]], 1 +// CHECK2-NEXT: [[SUB70:%.*]] = sub nsw i64 [[ADD69]], 1 +// CHECK2-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTFORWARD_IV___BEGIN316]], align 8 +// CHECK2-NEXT: [[SUB71:%.*]] = sub nsw i64 [[SUB70]], [[TMP33]] +// CHECK2-NEXT: store i64 [[SUB71]], ptr [[DOTREVERSED_IV___BEGIN3]], align 8 +// CHECK2-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8 +// CHECK2-NEXT: [[TMP35:%.*]] = load i64, ptr [[DOTREVERSED_IV___BEGIN3]], align 8 +// CHECK2-NEXT: [[MUL72:%.*]] = mul nsw i64 [[TMP35]], 1 +// CHECK2-NEXT: [[ADD_PTR73:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[MUL72]] +// CHECK2-NEXT: store ptr [[ADD_PTR73]], ptr [[__BEGIN3]], align 8 +// CHECK2-NEXT: [[TMP36:%.*]] = load ptr, ptr [[__BEGIN3]], align 8 +// CHECK2-NEXT: store ptr [[TMP36]], ptr [[V]], align 8 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[K15]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[C]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load ptr, ptr [[V]], align 8 +// CHECK2-NEXT: [[TMP40:%.*]] = load double, ptr [[TMP39]], align 8 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[J17]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP37]], i32 noundef [[TMP38]], double noundef [[TMP40]], i32 noundef [[TMP41]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[ADD74:%.*]] = add nsw i64 [[TMP42]], 1 +// CHECK2-NEXT: store i64 [[ADD74]], ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@tfoo5 +// CHECK2-SAME: () #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: call void @_Z4foo5IiTnT_Li3EEvS0_S0_(i32 noundef 0, i32 noundef 42) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z4foo5IiTnT_Li3EEvS0_S0_ +// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFORWARD_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTREVERSED_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: 
store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 3 +// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 3 +// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND:%.*]] +// CHECK2: for.cond: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP6]], 1 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP5]], [[ADD5]] +// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK2: for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD6:%.*]] = add i32 [[TMP7]], 1 +// CHECK2-NEXT: [[SUB7:%.*]] = sub i32 [[ADD6]], 1 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[SUB8:%.*]] = sub i32 [[SUB7]], [[TMP8]] +// CHECK2-NEXT: store i32 [[SUB8]], ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTREVERSED_IV_I]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 3 +// CHECK2-NEXT: [[ADD9:%.*]] = add i32 [[TMP9]], [[MUL]] +// CHECK2-NEXT: store i32 [[ADD9]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP11]]) +// CHECK2-NEXT: br label [[FOR_INC:%.*]] +// CHECK2: for.inc: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP12]], 1 +// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTFORWARD_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK2: for.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_reverse_codegen.cpp +// CHECK2-SAME: () #[[ATTR0]] section ".text.startup" { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: call void @__cxx_global_var_init() +// CHECK2-NEXT: ret void + diff --git a/clang/test/OpenMP/reverse_messages.cpp b/clang/test/OpenMP/reverse_messages.cpp new file mode 100644 index 0000000000000..9636a70bf2753 --- /dev/null +++ b/clang/test/OpenMP/reverse_messages.cpp @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s + +void func() { + + // expected-error@+2 {{statement after '#pragma omp reverse' must be a for loop}} + #pragma omp reverse + ; + + // expected-error@+2 {{statement after '#pragma omp reverse' must be a for loop}} + #pragma omp reverse + int b = 0; + + // expected-error@+2 {{statement after '#pragma omp reverse' must be a for loop}} + #pragma omp reverse + #pragma omp for + for (int i = 0; i < 7; ++i) + ; + + { + // expected-error@+2 {{expected statement}} + #pragma omp reverse + } + + // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', '>=', or '!=') of loop variable 'i'}} + #pragma omp reverse + for (int i = 0; i/3<7; ++i) + ; + + // expected-error@+1 {{unexpected OpenMP clause 'sizes' in directive '#pragma omp reverse'}} + #pragma omp reverse sizes(5) + for (int i = 0; i < 7; ++i) + ; + + // expected-warning@+1 {{extra tokens at the end of '#pragma omp reverse' are ignored}} + #pragma omp reverse foo + for (int i = 0; i < 7; ++i) + ; + +} + diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index fe0be203cb462..52d469c31715d 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2182,6 +2182,7 @@ class EnqueueVisitor : public ConstStmtVisitor, VisitOMPLoopTransformationDirective(const OMPLoopTransformationDirective *D); void VisitOMPTileDirective(const OMPTileDirective *D); void VisitOMPUnrollDirective(const OMPUnrollDirective *D); + void VisitOMPReverseDirective(const OMPReverseDirective *D); void VisitOMPForDirective(const OMPForDirective *D); void VisitOMPForSimdDirective(const OMPForSimdDirective *D); void VisitOMPSectionsDirective(const OMPSectionsDirective *D); @@ -3228,6 +3229,10 @@ void EnqueueVisitor::VisitOMPUnrollDirective(const OMPUnrollDirective *D) { VisitOMPLoopTransformationDirective(D); } +void EnqueueVisitor::VisitOMPReverseDirective(const OMPReverseDirective *D) { + VisitOMPLoopTransformationDirective(D); +} + void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) { VisitOMPLoopDirective(D); } @@ -6097,6 +6102,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OMPTileDirective"); case CXCursor_OMPUnrollDirective: return cxstring::createRef("OMPUnrollDirective"); + case CXCursor_OMPReverseDirective: + return cxstring::createRef("OMPReverseDirective"); case CXCursor_OMPForDirective: return cxstring::createRef("OMPForDirective"); case CXCursor_OMPForSimdDirective: diff --git a/clang/tools/libclang/CXCursor.cpp 
b/clang/tools/libclang/CXCursor.cpp index bc4b162880790..d59151f756dc3 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -673,6 +673,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OMPUnrollDirectiveClass: K = CXCursor_OMPUnrollDirective; break; + case Stmt::OMPReverseDirectiveClass: + K = CXCursor_OMPReverseDirective; + break; case Stmt::OMPForDirectiveClass: K = CXCursor_OMPForDirective; break; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 005c678302b27..4bd398128b673 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -837,6 +837,10 @@ def OMP_Requires : Directive<"requires"> { let association = AS_None; let category = CA_Informational; } +def OMP_Reverse : Directive<"reverse"> { + let association = AS_Loop; + let category = CA_Executable; +} def OMP_Scan : Directive<"scan"> { let allowedClauses = [ VersionedClause, diff --git a/openmp/runtime/test/transform/reverse/foreach.cpp b/openmp/runtime/test/transform/reverse/foreach.cpp new file mode 100644 index 0000000000000..0784e3c0057c9 --- /dev/null +++ b/openmp/runtime/test/transform/reverse/foreach.cpp @@ -0,0 +1,162 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + + 
void print(const char *msg) const { owner->print(msg); } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); +#pragma omp reverse + for (Reporter c{"init-stmt"}; auto &&v : Reporter("range")) + printf("v=%d\n", v); + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: [init-stmt] ctor +// CHECK-NEXT: [range] ctor +// CHECK-NEXT: [range] end() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] iterator distance: 3 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: v=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] dtor +// CHECK-NEXT: [init-stmt] dtor +// CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/reverse/intfor.c b/openmp/runtime/test/transform/reverse/intfor.c new file mode 100644 index 0000000000000..a526a8d493b3d --- /dev/null +++ b/openmp/runtime/test/transform/reverse/intfor.c @@ -0,0 +1,25 @@ +// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include + +int main() { + printf("do\n"); +#pragma omp reverse + for (int i = 7; i < 19; i += 3) + printf("i=%d\n", i); + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: i=16 +// CHECK-NEXT: i=13 +// CHECK-NEXT: i=10 +// CHECK-NEXT: i=7 +// CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/reverse/iterfor.cpp b/openmp/runtime/test/transform/reverse/iterfor.cpp new file mode 100644 index 0000000000000..ba1086dbd76a5 --- /dev/null +++ b/openmp/runtime/test/transform/reverse/iterfor.cpp @@ -0,0 +1,164 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), 
pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + bool operator!=(const Iterator &that) const { + owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos); + return this->pos != that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); + Reporter range("range"); +#pragma omp reverse + for (auto it = range.begin(); it != range.end(); ++it) + printf("v=%d\n", *it); + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: [range] ctor +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] end() +// CHECK-NEXT: [range] iterator distance: 3 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: v=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: done +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] dtor diff --git a/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp new file mode 100644 index 0000000000000..240ef59bd6b4b --- /dev/null +++ b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-foreach.cpp @@ -0,0 +1,285 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + 
Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + + void print(const char *msg) const { owner->print(msg); } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) 
const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); +#pragma omp parallel for collapse(3) num_threads(1) + for (int i = 0; i < 3; ++i) +#pragma omp reverse + for (Reporter c{"init-stmt"}; auto &&v : Reporter("range")) + for (int k = 0; k < 3; ++k) + printf("i=%d j=%d k=%d\n", i, v, k); + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: [init-stmt] ctor +// CHECK-NEXT: [range] ctor +// CHECK-NEXT: [range] end() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] iterator distance: 3 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=0 j=2 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=0 j=2 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=0 j=2 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=0 j=1 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=0 j=1 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=0 j=1 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=0 j=0 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=0 j=0 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=0 j=0 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=1 j=2 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=1 j=2 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=1 j=2 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=1 j=1 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=1 j=1 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator 
advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=1 j=1 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=1 j=0 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=1 j=0 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=1 j=0 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=2 j=2 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=2 j=2 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=2 j=2 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=2 j=1 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=2 j=1 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=2 j=1 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=2 j=0 k=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=2 j=0 k=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=2 j=0 k=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] dtor +// CHECK-NEXT: [init-stmt] dtor +// CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp new file mode 100644 index 0000000000000..ae545b863d86c --- /dev/null +++ b/openmp/runtime/test/transform/reverse/parallel-wsloop-collapse-intfor.cpp @@ -0,0 +1,51 @@ +// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include + +int main() { + printf("do\n"); +#pragma omp parallel for collapse(3) num_threads(1) + for (int i = 0; i < 3; ++i) +#pragma omp reverse + for (int j = 0; j < 3; ++j) + for (int k = 0; k < 3; ++k) + printf("i=%d j=%d k=%d\n", i, j, k); + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + 
+// CHECK: do +// CHECK-NEXT: i=0 j=2 k=0 +// CHECK-NEXT: i=0 j=2 k=1 +// CHECK-NEXT: i=0 j=2 k=2 +// CHECK-NEXT: i=0 j=1 k=0 +// CHECK-NEXT: i=0 j=1 k=1 +// CHECK-NEXT: i=0 j=1 k=2 +// CHECK-NEXT: i=0 j=0 k=0 +// CHECK-NEXT: i=0 j=0 k=1 +// CHECK-NEXT: i=0 j=0 k=2 +// CHECK-NEXT: i=1 j=2 k=0 +// CHECK-NEXT: i=1 j=2 k=1 +// CHECK-NEXT: i=1 j=2 k=2 +// CHECK-NEXT: i=1 j=1 k=0 +// CHECK-NEXT: i=1 j=1 k=1 +// CHECK-NEXT: i=1 j=1 k=2 +// CHECK-NEXT: i=1 j=0 k=0 +// CHECK-NEXT: i=1 j=0 k=1 +// CHECK-NEXT: i=1 j=0 k=2 +// CHECK-NEXT: i=2 j=2 k=0 +// CHECK-NEXT: i=2 j=2 k=1 +// CHECK-NEXT: i=2 j=2 k=2 +// CHECK-NEXT: i=2 j=1 k=0 +// CHECK-NEXT: i=2 j=1 k=1 +// CHECK-NEXT: i=2 j=1 k=2 +// CHECK-NEXT: i=2 j=0 k=0 +// CHECK-NEXT: i=2 j=0 k=1 +// CHECK-NEXT: i=2 j=0 k=2 +// CHECK-NEXT: done From 1dfbd07255f50ab3920d397dda5f8f9c05020f76 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 17 Jul 2024 16:54:45 +0800 Subject: [PATCH 393/777] [ValueTracking] Add tests for `llvm.vector.reverse` with `DemandedElts`; NFC --- .../test/Analysis/ValueTracking/known-bits.ll | 30 ++++++++++++++++ .../Analysis/ValueTracking/known-fpclass.ll | 35 +++++++++++++++++++ .../Analysis/ValueTracking/known-non-zero.ll | 30 ++++++++++++++++ 3 files changed, 95 insertions(+) diff --git a/llvm/test/Analysis/ValueTracking/known-bits.ll b/llvm/test/Analysis/ValueTracking/known-bits.ll index 035ccf8d42d13..0b1f45ee21ea9 100644 --- a/llvm/test/Analysis/ValueTracking/known-bits.ll +++ b/llvm/test/Analysis/ValueTracking/known-bits.ll @@ -23,3 +23,33 @@ define <4 x i1> @vec_reverse_known_bits_fail(<4 x i8> %xx) { %r = icmp slt <4 x i8> %rev, zeroinitializer ret <4 x i1> %r } + +define i1 @vec_reverse_known_bits_demanded(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_known_bits_demanded( +; CHECK-NEXT: [[X:%.*]] = or <4 x i8> [[XX:%.*]], +; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) +; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 1 +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[ELE]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %x = or <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %ele = extractelement <4 x i8> %rev, i64 1 + %r = icmp slt i8 %ele, 0 + ret i1 %r +} + +define i1 @vec_reverse_known_bits_demanded_fail(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_known_bits_demanded_fail( +; CHECK-NEXT: [[X:%.*]] = or <4 x i8> [[XX:%.*]], +; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) +; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 2 +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[ELE]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %x = or <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %ele = extractelement <4 x i8> %rev, i64 2 + %r = icmp slt i8 %ele, 0 + ret i1 %r +} diff --git a/llvm/test/Analysis/ValueTracking/known-fpclass.ll b/llvm/test/Analysis/ValueTracking/known-fpclass.ll index 59f3eed715b52..225120584edef 100644 --- a/llvm/test/Analysis/ValueTracking/known-fpclass.ll +++ b/llvm/test/Analysis/ValueTracking/known-fpclass.ll @@ -24,3 +24,38 @@ define <4 x i1> @vector_reverse_fpclass2(<4 x double> nofpclass(nzero) %x) { ret <4 x i1> %cmp } +define i1 @vector_reverse_fpclass_demanded(<4 x double> %vec, double nofpclass(nzero nan) %x) { +; CHECK-LABEL: @vector_reverse_fpclass_demanded( +; CHECK-NEXT: [[X_ABS:%.*]] = call double @llvm.fabs.f64(double [[X:%.*]]) +; CHECK-NEXT: [[VEC_X:%.*]] = insertelement <4 x double> [[VEC:%.*]], double [[X_ABS]], i64 1 +; CHECK-NEXT: [[REV:%.*]] = call <4 x double> 
@llvm.vector.reverse.v4f64(<4 x double> [[VEC_X]]) +; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x double> [[REV]], i64 2 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[ELE]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + + %x.abs = call double @llvm.fabs.f64(double %x) + %vec.x = insertelement <4 x double> %vec, double %x.abs, i64 1 + %rev = call <4 x double> @llvm.vector.reverse(<4 x double> %vec.x) + %ele = extractelement <4 x double> %rev, i64 2 + %cmp = fcmp oge double %ele, 0.0 + ret i1 %cmp +} + +define i1 @vector_reverse_fpclass_demanded_fail(<4 x double> %vec, double nofpclass(nzero nan) %x) { +; CHECK-LABEL: @vector_reverse_fpclass_demanded_fail( +; CHECK-NEXT: [[X_ABS:%.*]] = call double @llvm.fabs.f64(double [[X:%.*]]) +; CHECK-NEXT: [[VEC_X:%.*]] = insertelement <4 x double> [[VEC:%.*]], double [[X_ABS]], i64 1 +; CHECK-NEXT: [[REV:%.*]] = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> [[VEC_X]]) +; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x double> [[REV]], i64 1 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[ELE]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + + %x.abs = call double @llvm.fabs.f64(double %x) + %vec.x = insertelement <4 x double> %vec, double %x.abs, i64 1 + %rev = call <4 x double> @llvm.vector.reverse(<4 x double> %vec.x) + %ele = extractelement <4 x double> %rev, i64 1 + %cmp = fcmp oge double %ele, 0.0 + ret i1 %cmp +} diff --git a/llvm/test/Analysis/ValueTracking/known-non-zero.ll b/llvm/test/Analysis/ValueTracking/known-non-zero.ll index 5704586d92300..98f368a7cd6c8 100644 --- a/llvm/test/Analysis/ValueTracking/known-non-zero.ll +++ b/llvm/test/Analysis/ValueTracking/known-non-zero.ll @@ -1520,4 +1520,34 @@ define <4 x i1> @vec_reverse_non_zero_fail(<4 x i8> %xx) { ret <4 x i1> %r } +define i1 @vec_reverse_non_zero_demanded(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_non_zero_demanded( +; CHECK-NEXT: [[X:%.*]] = add nuw <4 x i8> [[XX:%.*]], +; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) +; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 3 +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ELE]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %x = add nuw <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %ele = extractelement <4 x i8> %rev, i64 3 + %r = icmp eq i8 %ele, 0 + ret i1 %r +} + +define i1 @vec_reverse_non_zero_demanded_fail(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_non_zero_demanded_fail( +; CHECK-NEXT: [[X:%.*]] = add nuw <4 x i8> [[XX:%.*]], +; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) +; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 2 +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ELE]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %x = add nuw <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %ele = extractelement <4 x i8> %rev, i64 2 + %r = icmp eq i8 %ele, 0 + ret i1 %r +} + declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1) From 6ef970b65f93b0f7fbcd8ffc44fb9b9af58cc097 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 16 Jul 2024 20:27:55 +0800 Subject: [PATCH 394/777] [ValueTracking] Consistently propagate `DemandedElts` is `computeKnownBits` --- llvm/lib/Analysis/ValueTracking.cpp | 88 ++++++++++--------- .../test/Analysis/ValueTracking/known-bits.ll | 6 +- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f8ec868398323..8bc0e7f23b81c 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp 
+++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1091,15 +1091,15 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Instruction::UDiv: { - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::udiv(Known, Known2, Q.IIQ.isExact(cast(I))); break; } case Instruction::SDiv: { - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::sdiv(Known, Known2, Q.IIQ.isExact(cast(I))); break; @@ -1107,7 +1107,7 @@ static void computeKnownBitsFromOperator(const Operator *I, case Instruction::Select: { auto ComputeForArm = [&](Value *Arm, bool Invert) { KnownBits Res(Known.getBitWidth()); - computeKnownBits(Arm, Res, Depth + 1, Q); + computeKnownBits(Arm, DemandedElts, Res, Depth + 1, Q); adjustKnownBitsForSelectArm(Res, I->getOperand(0), Arm, Invert, Depth, Q); return Res; }; @@ -1142,7 +1142,7 @@ static void computeKnownBitsFromOperator(const Operator *I, assert(SrcBitWidth && "SrcBitWidth can't be zero"); Known = Known.anyextOrTrunc(SrcBitWidth); - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); if (auto *Inst = dyn_cast(I); Inst && Inst->hasNonNeg() && !Known.isNegative()) Known.makeNonNegative(); @@ -1164,7 +1164,8 @@ static void computeKnownBitsFromOperator(const Operator *I, if (match(I, m_ElementWiseBitCast(m_Value(V))) && V->getType()->isFPOrFPVectorTy()) { Type *FPType = V->getType()->getScalarType(); - KnownFPClass Result = computeKnownFPClass(V, fcAllFlags, Depth + 1, Q); + KnownFPClass Result = + computeKnownFPClass(V, DemandedElts, fcAllFlags, Depth + 1, Q); FPClassTest FPClasses = Result.KnownFPClasses; // TODO: Treat it as zero/poison if the use of I is unreachable. @@ -1245,7 +1246,7 @@ static void computeKnownBitsFromOperator(const Operator *I, unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); Known = Known.trunc(SrcBitWidth); - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. Known = Known.sext(BitWidth); @@ -1305,14 +1306,14 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Instruction::SRem: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::srem(Known, Known2); break; case Instruction::URem: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::urem(Known, Known2); break; case Instruction::Alloca: @@ -1465,17 +1466,17 @@ static void computeKnownBitsFromOperator(const Operator *I, unsigned OpNum = P->getOperand(0) == R ? 
0 : 1; Instruction *RInst = P->getIncomingBlock(OpNum)->getTerminator(); - Instruction *LInst = P->getIncomingBlock(1-OpNum)->getTerminator(); + Instruction *LInst = P->getIncomingBlock(1 - OpNum)->getTerminator(); // Ok, we have a PHI of the form L op= R. Check for low // zero bits. RecQ.CxtI = RInst; - computeKnownBits(R, Known2, Depth + 1, RecQ); + computeKnownBits(R, DemandedElts, Known2, Depth + 1, RecQ); // We need to take the minimum number of known bits KnownBits Known3(BitWidth); RecQ.CxtI = LInst; - computeKnownBits(L, Known3, Depth + 1, RecQ); + computeKnownBits(L, DemandedElts, Known3, Depth + 1, RecQ); Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(), Known3.countMinTrailingZeros())); @@ -1548,7 +1549,8 @@ static void computeKnownBitsFromOperator(const Operator *I, // want to waste time spinning around in loops. // TODO: See if we can base recursion limiter on number of incoming phi // edges so we don't overly clamp analysis. - computeKnownBits(IncValue, Known2, MaxAnalysisRecursionDepth - 1, RecQ); + computeKnownBits(IncValue, DemandedElts, Known2, + MaxAnalysisRecursionDepth - 1, RecQ); // See if we can further use a conditional branch into the phi // to help us determine the range of the value. @@ -1619,9 +1621,10 @@ static void computeKnownBitsFromOperator(const Operator *I, } if (const IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { - default: break; + default: + break; case Intrinsic::abs: { - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); bool IntMinIsPoison = match(II->getArgOperand(1), m_One()); Known = Known2.abs(IntMinIsPoison); break; @@ -1637,7 +1640,7 @@ static void computeKnownBitsFromOperator(const Operator *I, Known.One |= Known2.One.byteSwap(); break; case Intrinsic::ctlz: { - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleLZ = Known2.countMaxLeadingZeros(); // If this call is poison for 0 input, the result will be less than 2^n. @@ -1648,7 +1651,7 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Intrinsic::cttz: { - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleTZ = Known2.countMaxTrailingZeros(); // If this call is poison for 0 input, the result will be less than 2^n. @@ -1659,7 +1662,7 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Intrinsic::ctpop: { - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. 
unsigned BitsPossiblySet = Known2.countMaxPopulation(); @@ -1681,8 +1684,8 @@ static void computeKnownBitsFromOperator(const Operator *I, ShiftAmt = BitWidth - ShiftAmt; KnownBits Known3(BitWidth); - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known3, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known3, Depth + 1, Q); Known.Zero = Known2.Zero.shl(ShiftAmt) | Known3.Zero.lshr(BitWidth - ShiftAmt); @@ -1691,27 +1694,30 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Intrinsic::uadd_sat: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::uadd_sat(Known, Known2); break; case Intrinsic::usub_sat: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::usub_sat(Known, Known2); break; case Intrinsic::sadd_sat: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::sadd_sat(Known, Known2); break; case Intrinsic::ssub_sat: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::ssub_sat(Known, Known2); break; // Vec reverse preserves bits from input vec. case Intrinsic::vector_reverse: + computeKnownBits(I->getOperand(0), DemandedElts.reverseBits(), Known, + Depth + 1, Q); + break; // for min/max/and/or reduce, any bit common to each element in the // input vec is set in the output. 
case Intrinsic::vector_reduce_and: @@ -1738,31 +1744,31 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Intrinsic::umin: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::umin(Known, Known2); break; case Intrinsic::umax: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::umax(Known, Known2); break; case Intrinsic::smin: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::smin(Known, Known2); break; case Intrinsic::smax: - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::smax(Known, Known2); break; case Intrinsic::ptrmask: { - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); const Value *Mask = I->getOperand(1); Known2 = KnownBits(Mask->getType()->getScalarSizeInBits()); - computeKnownBits(Mask, Known2, Depth + 1, Q); + computeKnownBits(Mask, DemandedElts, Known2, Depth + 1, Q); // TODO: 1-extend would be more precise. 
Known &= Known2.anyextOrTrunc(BitWidth); break; diff --git a/llvm/test/Analysis/ValueTracking/known-bits.ll b/llvm/test/Analysis/ValueTracking/known-bits.ll index 0b1f45ee21ea9..9d0b153d8ccfc 100644 --- a/llvm/test/Analysis/ValueTracking/known-bits.ll +++ b/llvm/test/Analysis/ValueTracking/known-bits.ll @@ -26,11 +26,7 @@ define <4 x i1> @vec_reverse_known_bits_fail(<4 x i8> %xx) { define i1 @vec_reverse_known_bits_demanded(<4 x i8> %xx) { ; CHECK-LABEL: @vec_reverse_known_bits_demanded( -; CHECK-NEXT: [[X:%.*]] = or <4 x i8> [[XX:%.*]], -; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) -; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 1 -; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[ELE]], 0 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; %x = or <4 x i8> %xx, %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) From 72ff0499bba72eecc1b3d19833027b6c04337041 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 16 Jul 2024 20:38:18 +0800 Subject: [PATCH 395/777] [ValueTracking] Consistently propagate `DemandedElts` is `isKnownNonZero` --- llvm/lib/Analysis/ValueTracking.cpp | 90 ++++++++++++------- .../Analysis/ValueTracking/known-non-zero.ll | 6 +- 2 files changed, 57 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 8bc0e7f23b81c..b715ab6eabf70 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -303,15 +303,21 @@ bool llvm::isKnownNegative(const Value *V, const SimplifyQuery &SQ, return computeKnownBits(V, Depth, SQ).isNegative(); } -static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth, +static bool isKnownNonEqual(const Value *V1, const Value *V2, + const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q); bool llvm::isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, bool UseInstrInfo) { + assert(V1->getType() == V2->getType() && + "Testing equality of non-equal types!"); + auto *FVTy = dyn_cast(V1->getType()); + APInt DemandedElts = + FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1); return ::isKnownNonEqual( - V1, V2, 0, + V1, V2, DemandedElts, 0, SimplifyQuery(DL, DT, AC, safeCxtI(V2, V1, CxtI), UseInstrInfo)); } @@ -2654,7 +2660,7 @@ static bool isNonZeroSub(const APInt &DemandedElts, unsigned Depth, if (C->isNullValue() && isKnownNonZero(Y, DemandedElts, Q, Depth)) return true; - return ::isKnownNonEqual(X, Y, Depth, Q); + return ::isKnownNonEqual(X, Y, DemandedElts, Depth, Q); } static bool isNonZeroMul(const APInt &DemandedElts, unsigned Depth, @@ -2778,8 +2784,11 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // This all implies the 2 i16 elements are non-zero. 
Type *FromTy = I->getOperand(0)->getType(); if ((FromTy->isIntOrIntVectorTy() || FromTy->isPtrOrPtrVectorTy()) && - (BitWidth % getBitWidth(FromTy->getScalarType(), Q.DL)) == 0) + (BitWidth % getBitWidth(FromTy->getScalarType(), Q.DL)) == 0) { + if (match(I, m_ElementWiseBitCast(m_Value()))) + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); return isKnownNonZero(I->getOperand(0), Q, Depth); + } } break; case Instruction::IntToPtr: // Note that we have to take special care to avoid looking through @@ -2788,7 +2797,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (!isa(I->getType()) && Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <= Q.DL.getTypeSizeInBits(I->getType()).getFixedValue()) - return isKnownNonZero(I->getOperand(0), Q, Depth); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); break; case Instruction::PtrToInt: // Similar to int2ptr above, we can look through ptr2int here if the cast @@ -2796,13 +2805,13 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (!isa(I->getType()) && Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <= Q.DL.getTypeSizeInBits(I->getType()).getFixedValue()) - return isKnownNonZero(I->getOperand(0), Q, Depth); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); break; case Instruction::Trunc: // nuw/nsw trunc preserves zero/non-zero status of input. if (auto *TI = dyn_cast(I)) if (TI->hasNoSignedWrap() || TI->hasNoUnsignedWrap()) - return isKnownNonZero(TI->getOperand(0), Q, Depth); + return isKnownNonZero(TI->getOperand(0), DemandedElts, Q, Depth); break; case Instruction::Sub: @@ -2823,13 +2832,13 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Instruction::SExt: case Instruction::ZExt: // ext X != 0 if X != 0. - return isKnownNonZero(I->getOperand(0), Q, Depth); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); case Instruction::Shl: { // shl nsw/nuw can't remove any non-zero bits. const OverflowingBinaryOperator *BO = cast(I); if (Q.IIQ.hasNoUnsignedWrap(BO) || Q.IIQ.hasNoSignedWrap(BO)) - return isKnownNonZero(I->getOperand(0), Q, Depth); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); // shl X, Y != 0 if X is odd. Note that the value of the shift is undefined // if the lowest bit is shifted off the end. @@ -2845,7 +2854,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // shr exact can only shift out zero bits. const PossiblyExactOperator *BO = cast(I); if (BO->isExact()) - return isKnownNonZero(I->getOperand(0), Q, Depth); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); // shr X, Y != 0 if X is negative. Note that the value of the shift is not // defined if the sign bit is shifted off the end. @@ -3100,6 +3109,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, /*NSW=*/true, /* NUW=*/false); // Vec reverse preserves zero/non-zero status from input vec. case Intrinsic::vector_reverse: + return isKnownNonZero(II->getArgOperand(0), DemandedElts.reverseBits(), + Q, Depth); // umin/smin/smax/smin/or of all non-zero elements is always non-zero. case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_umax: @@ -3424,7 +3435,8 @@ getInvertibleOperands(const Operator *Op1, /// Only handle a small subset of binops where (binop V2, X) with non-zero X /// implies V2 != V1. 
static bool isModifyingBinopOfNonZero(const Value *V1, const Value *V2, - unsigned Depth, const SimplifyQuery &Q) { + const APInt &DemandedElts, unsigned Depth, + const SimplifyQuery &Q) { const BinaryOperator *BO = dyn_cast(V1); if (!BO) return false; @@ -3444,39 +3456,43 @@ static bool isModifyingBinopOfNonZero(const Value *V1, const Value *V2, Op = BO->getOperand(0); else return false; - return isKnownNonZero(Op, Q, Depth + 1); + return isKnownNonZero(Op, DemandedElts, Q, Depth + 1); } return false; } /// Return true if V2 == V1 * C, where V1 is known non-zero, C is not 0/1 and /// the multiplication is nuw or nsw. -static bool isNonEqualMul(const Value *V1, const Value *V2, unsigned Depth, +static bool isNonEqualMul(const Value *V1, const Value *V2, + const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q) { if (auto *OBO = dyn_cast(V2)) { const APInt *C; return match(OBO, m_Mul(m_Specific(V1), m_APInt(C))) && (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) && - !C->isZero() && !C->isOne() && isKnownNonZero(V1, Q, Depth + 1); + !C->isZero() && !C->isOne() && + isKnownNonZero(V1, DemandedElts, Q, Depth + 1); } return false; } /// Return true if V2 == V1 << C, where V1 is known non-zero, C is not 0 and /// the shift is nuw or nsw. -static bool isNonEqualShl(const Value *V1, const Value *V2, unsigned Depth, +static bool isNonEqualShl(const Value *V1, const Value *V2, + const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q) { if (auto *OBO = dyn_cast(V2)) { const APInt *C; return match(OBO, m_Shl(m_Specific(V1), m_APInt(C))) && (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) && - !C->isZero() && isKnownNonZero(V1, Q, Depth + 1); + !C->isZero() && isKnownNonZero(V1, DemandedElts, Q, Depth + 1); } return false; } static bool isNonEqualPHIs(const PHINode *PN1, const PHINode *PN2, - unsigned Depth, const SimplifyQuery &Q) { + const APInt &DemandedElts, unsigned Depth, + const SimplifyQuery &Q) { // Check two PHIs are in same block. if (PN1->getParent() != PN2->getParent()) return false; @@ -3498,14 +3514,15 @@ static bool isNonEqualPHIs(const PHINode *PN1, const PHINode *PN2, SimplifyQuery RecQ = Q; RecQ.CxtI = IncomBB->getTerminator(); - if (!isKnownNonEqual(IV1, IV2, Depth + 1, RecQ)) + if (!isKnownNonEqual(IV1, IV2, DemandedElts, Depth + 1, RecQ)) return false; UsedFullRecursion = true; } return true; } -static bool isNonEqualSelect(const Value *V1, const Value *V2, unsigned Depth, +static bool isNonEqualSelect(const Value *V1, const Value *V2, + const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q) { const SelectInst *SI1 = dyn_cast(V1); if (!SI1) @@ -3516,12 +3533,12 @@ static bool isNonEqualSelect(const Value *V1, const Value *V2, unsigned Depth, const Value *Cond2 = SI2->getCondition(); if (Cond1 == Cond2) return isKnownNonEqual(SI1->getTrueValue(), SI2->getTrueValue(), - Depth + 1, Q) && + DemandedElts, Depth + 1, Q) && isKnownNonEqual(SI1->getFalseValue(), SI2->getFalseValue(), - Depth + 1, Q); + DemandedElts, Depth + 1, Q); } - return isKnownNonEqual(SI1->getTrueValue(), V2, Depth + 1, Q) && - isKnownNonEqual(SI1->getFalseValue(), V2, Depth + 1, Q); + return isKnownNonEqual(SI1->getTrueValue(), V2, DemandedElts, Depth + 1, Q) && + isKnownNonEqual(SI1->getFalseValue(), V2, DemandedElts, Depth + 1, Q); } // Check to see if A is both a GEP and is the incoming value for a PHI in the @@ -3577,7 +3594,8 @@ static bool isNonEqualPointersWithRecursiveGEP(const Value *A, const Value *B, } /// Return true if it is known that V1 != V2. 
-static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth, +static bool isKnownNonEqual(const Value *V1, const Value *V2, + const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q) { if (V1 == V2) return false; @@ -3595,40 +3613,44 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth, auto *O2 = dyn_cast(V2); if (O1 && O2 && O1->getOpcode() == O2->getOpcode()) { if (auto Values = getInvertibleOperands(O1, O2)) - return isKnownNonEqual(Values->first, Values->second, Depth + 1, Q); + return isKnownNonEqual(Values->first, Values->second, DemandedElts, + Depth + 1, Q); if (const PHINode *PN1 = dyn_cast(V1)) { const PHINode *PN2 = cast(V2); // FIXME: This is missing a generalization to handle the case where one is // a PHI and another one isn't. - if (isNonEqualPHIs(PN1, PN2, Depth, Q)) + if (isNonEqualPHIs(PN1, PN2, DemandedElts, Depth, Q)) return true; }; } - if (isModifyingBinopOfNonZero(V1, V2, Depth, Q) || - isModifyingBinopOfNonZero(V2, V1, Depth, Q)) + if (isModifyingBinopOfNonZero(V1, V2, DemandedElts, Depth, Q) || + isModifyingBinopOfNonZero(V2, V1, DemandedElts, Depth, Q)) return true; - if (isNonEqualMul(V1, V2, Depth, Q) || isNonEqualMul(V2, V1, Depth, Q)) + if (isNonEqualMul(V1, V2, DemandedElts, Depth, Q) || + isNonEqualMul(V2, V1, DemandedElts, Depth, Q)) return true; - if (isNonEqualShl(V1, V2, Depth, Q) || isNonEqualShl(V2, V1, Depth, Q)) + if (isNonEqualShl(V1, V2, DemandedElts, Depth, Q) || + isNonEqualShl(V2, V1, DemandedElts, Depth, Q)) return true; if (V1->getType()->isIntOrIntVectorTy()) { // Are any known bits in V1 contradictory to known bits in V2? If V1 // has a known zero where V2 has a known one, they must not be equal. - KnownBits Known1 = computeKnownBits(V1, Depth, Q); + KnownBits Known1 = computeKnownBits(V1, DemandedElts, Depth, Q); if (!Known1.isUnknown()) { - KnownBits Known2 = computeKnownBits(V2, Depth, Q); + KnownBits Known2 = computeKnownBits(V2, DemandedElts, Depth, Q); if (Known1.Zero.intersects(Known2.One) || Known2.Zero.intersects(Known1.One)) return true; } } - if (isNonEqualSelect(V1, V2, Depth, Q) || isNonEqualSelect(V2, V1, Depth, Q)) + if (isNonEqualSelect(V1, V2, DemandedElts, Depth, Q) || + isNonEqualSelect(V2, V1, DemandedElts, Depth, Q)) return true; if (isNonEqualPointersWithRecursiveGEP(V1, V2, Q) || @@ -3640,7 +3662,7 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, unsigned Depth, // Check PtrToInt type matches the pointer size. 
if (match(V1, m_PtrToIntSameSize(Q.DL, m_Value(A))) && match(V2, m_PtrToIntSameSize(Q.DL, m_Value(B)))) - return isKnownNonEqual(A, B, Depth + 1, Q); + return isKnownNonEqual(A, B, DemandedElts, Depth + 1, Q); return false; } diff --git a/llvm/test/Analysis/ValueTracking/known-non-zero.ll b/llvm/test/Analysis/ValueTracking/known-non-zero.ll index 98f368a7cd6c8..db2c4f3a1ed65 100644 --- a/llvm/test/Analysis/ValueTracking/known-non-zero.ll +++ b/llvm/test/Analysis/ValueTracking/known-non-zero.ll @@ -1522,11 +1522,7 @@ define <4 x i1> @vec_reverse_non_zero_fail(<4 x i8> %xx) { define i1 @vec_reverse_non_zero_demanded(<4 x i8> %xx) { ; CHECK-LABEL: @vec_reverse_non_zero_demanded( -; CHECK-NEXT: [[X:%.*]] = add nuw <4 x i8> [[XX:%.*]], -; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) -; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x i8> [[REV]], i64 3 -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ELE]], 0 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %x = add nuw <4 x i8> %xx, %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) From e8eeda8e4dc581ae744bc64a2683b5533fec8922 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 16 Jul 2024 20:40:39 +0800 Subject: [PATCH 396/777] [ValueTracking] Consistently propagate `DemandedElts` is `ComputeNumSignBits` --- llvm/lib/Analysis/ValueTracking.cpp | 67 +++++++++++++++++------------ 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index b715ab6eabf70..f54de030d3344 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -3801,7 +3801,8 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, default: break; case Instruction::SExt: Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits(); - return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp; + return ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q) + + Tmp; case Instruction::SDiv: { const APInt *Denominator; @@ -3813,7 +3814,8 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, break; // Calculate the incoming numerator bits. - unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + unsigned NumBits = + ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); // Add floor(log(C)) bits to the numerator bits. return std::min(TyBits, NumBits + Denominator->logBase2()); @@ -3822,7 +3824,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, } case Instruction::SRem: { - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); const APInt *Denominator; // srem X, C -> we know that the result is within [-C+1,C) when C is a @@ -3853,7 +3855,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, } case Instruction::AShr: { - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); // ashr X, C -> adds C sign bits. Vectors too. const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { @@ -3869,7 +3871,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { // shl destroys sign bits. - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); if (ShAmt->uge(TyBits) || // Bad shift. ShAmt->uge(Tmp)) break; // Shifted all sign bits out. 
Tmp2 = ShAmt->getZExtValue(); @@ -3881,9 +3883,9 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, case Instruction::Or: case Instruction::Xor: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); if (Tmp != 1) { - Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); + Tmp2 = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses @@ -3899,9 +3901,10 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, if (isSignedMinMaxClamp(U, X, CLow, CHigh)) return std::min(CLow->getNumSignBits(), CHigh->getNumSignBits()); - Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (Tmp == 1) break; - Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q); + Tmp = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q); + if (Tmp == 1) + break; + Tmp2 = ComputeNumSignBits(U->getOperand(2), DemandedElts, Depth + 1, Q); return std::min(Tmp, Tmp2); } @@ -3915,7 +3918,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, if (const auto *CRHS = dyn_cast(U->getOperand(1))) if (CRHS->isAllOnesValue()) { KnownBits Known(TyBits); - computeKnownBits(U->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(U->getOperand(0), DemandedElts, Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is // all sign bits set. @@ -3928,19 +3931,21 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, return Tmp; } - Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (Tmp2 == 1) break; + Tmp2 = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q); + if (Tmp2 == 1) + break; return std::min(Tmp, Tmp2) - 1; case Instruction::Sub: - Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (Tmp2 == 1) break; + Tmp2 = ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q); + if (Tmp2 == 1) + break; // Handle NEG. if (const auto *CLHS = dyn_cast(U->getOperand(0))) if (CLHS->isNullValue()) { KnownBits Known(TyBits); - computeKnownBits(U->getOperand(1), Known, Depth + 1, Q); + computeKnownBits(U->getOperand(1), DemandedElts, Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is // all sign bits set. if ((Known.Zero | 1).isAllOnes()) @@ -3957,17 +3962,22 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (Tmp == 1) break; + Tmp = ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); + if (Tmp == 1) + break; return std::min(Tmp, Tmp2) - 1; case Instruction::Mul: { // The output of the Mul can be at most twice the valid bits in the // inputs. 
- unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (SignBitsOp0 == 1) break; - unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (SignBitsOp1 == 1) break; + unsigned SignBitsOp0 = + ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); + if (SignBitsOp0 == 1) + break; + unsigned SignBitsOp1 = + ComputeNumSignBits(U->getOperand(1), DemandedElts, Depth + 1, Q); + if (SignBitsOp1 == 1) + break; unsigned OutValidBits = (TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1); return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1; @@ -3988,8 +3998,8 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, for (unsigned i = 0, e = NumIncomingValues; i != e; ++i) { if (Tmp == 1) return Tmp; RecQ.CxtI = PN->getIncomingBlock(i)->getTerminator(); - Tmp = std::min( - Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, RecQ)); + Tmp = std::min(Tmp, ComputeNumSignBits(PN->getIncomingValue(i), + DemandedElts, Depth + 1, RecQ)); } return Tmp; } @@ -4050,10 +4060,13 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, case Instruction::Call: { if (const auto *II = dyn_cast(U)) { switch (II->getIntrinsicID()) { - default: break; + default: + break; case Intrinsic::abs: - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (Tmp == 1) break; + Tmp = + ComputeNumSignBits(U->getOperand(0), DemandedElts, Depth + 1, Q); + if (Tmp == 1) + break; // Absolute value reduces number of sign bits by at most 1. return Tmp - 1; From 0589762e4e42a21796ca74eeb356cdfc50eaa232 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 16 Jul 2024 21:30:36 +0800 Subject: [PATCH 397/777] [ValueTracking] Consistently propagate `DemandedElts` is `computeKnownFPClass` Closes #99080 --- llvm/include/llvm/Analysis/ValueTracking.h | 22 ++++++++++++++----- llvm/lib/Analysis/ValueTracking.cpp | 5 +++-- .../Analysis/ValueTracking/known-fpclass.ll | 7 +----- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 354ad5bc95317..2c2f965a3cd6f 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -526,16 +526,17 @@ inline KnownFPClass computeKnownFPClass( } /// Wrapper to account for known fast math flags at the use instruction. -inline KnownFPClass computeKnownFPClass(const Value *V, FastMathFlags FMF, - FPClassTest InterestedClasses, - unsigned Depth, - const SimplifyQuery &SQ) { +inline KnownFPClass +computeKnownFPClass(const Value *V, const APInt &DemandedElts, + FastMathFlags FMF, FPClassTest InterestedClasses, + unsigned Depth, const SimplifyQuery &SQ) { if (FMF.noNaNs()) InterestedClasses &= ~fcNan; if (FMF.noInfs()) InterestedClasses &= ~fcInf; - KnownFPClass Result = computeKnownFPClass(V, InterestedClasses, Depth, SQ); + KnownFPClass Result = + computeKnownFPClass(V, DemandedElts, InterestedClasses, Depth, SQ); if (FMF.noNaNs()) Result.KnownFPClasses &= ~fcNan; @@ -544,6 +545,17 @@ inline KnownFPClass computeKnownFPClass(const Value *V, FastMathFlags FMF, return Result; } +inline KnownFPClass computeKnownFPClass(const Value *V, FastMathFlags FMF, + FPClassTest InterestedClasses, + unsigned Depth, + const SimplifyQuery &SQ) { + auto *FVTy = dyn_cast(V->getType()); + APInt DemandedElts = + FVTy ? 
APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1); + return computeKnownFPClass(V, DemandedElts, FMF, InterestedClasses, Depth, + SQ); +} + /// Return true if we can prove that the specified FP value is never equal to /// -0.0. Users should use caution when considering PreserveSign /// denormal-fp-math. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f54de030d3344..6e039ad2deadb 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5274,8 +5274,9 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, } // reverse preserves all characteristics of the input vec's element. case Intrinsic::vector_reverse: - Known = computeKnownFPClass(II->getArgOperand(0), II->getFastMathFlags(), - InterestedClasses, Depth + 1, Q); + Known = computeKnownFPClass( + II->getArgOperand(0), DemandedElts.reverseBits(), + II->getFastMathFlags(), InterestedClasses, Depth + 1, Q); break; case Intrinsic::trunc: case Intrinsic::floor: diff --git a/llvm/test/Analysis/ValueTracking/known-fpclass.ll b/llvm/test/Analysis/ValueTracking/known-fpclass.ll index 225120584edef..2b8e6298d746a 100644 --- a/llvm/test/Analysis/ValueTracking/known-fpclass.ll +++ b/llvm/test/Analysis/ValueTracking/known-fpclass.ll @@ -26,12 +26,7 @@ define <4 x i1> @vector_reverse_fpclass2(<4 x double> nofpclass(nzero) %x) { define i1 @vector_reverse_fpclass_demanded(<4 x double> %vec, double nofpclass(nzero nan) %x) { ; CHECK-LABEL: @vector_reverse_fpclass_demanded( -; CHECK-NEXT: [[X_ABS:%.*]] = call double @llvm.fabs.f64(double [[X:%.*]]) -; CHECK-NEXT: [[VEC_X:%.*]] = insertelement <4 x double> [[VEC:%.*]], double [[X_ABS]], i64 1 -; CHECK-NEXT: [[REV:%.*]] = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> [[VEC_X]]) -; CHECK-NEXT: [[ELE:%.*]] = extractelement <4 x double> [[REV]], i64 2 -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[ELE]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %x.abs = call double @llvm.fabs.f64(double %x) From c1263b326439dd623264d35ac5d006800092bac6 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 18 Jul 2024 10:44:56 +0200 Subject: [PATCH 398/777] [SCCP] Add tests for vectors ins phis (NFC) --- llvm/test/Transforms/SCCP/phis.ll | 129 +++++++++++++++++++++++++----- 1 file changed, 111 insertions(+), 18 deletions(-) diff --git a/llvm/test/Transforms/SCCP/phis.ll b/llvm/test/Transforms/SCCP/phis.ll index 21d97c41388aa..83daae0a7c0c8 100644 --- a/llvm/test/Transforms/SCCP/phis.ll +++ b/llvm/test/Transforms/SCCP/phis.ll @@ -1,9 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=sccp -S | FileCheck %s define i1 @float.1(i1 %cmp) { -; CHECK-LABEL: define i1 @float.1(i1 %cmp) { - -; CHECK-LABEL: end: +; CHECK-LABEL: define i1 @float.1( +; CHECK-SAME: i1 [[CMP:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]] +; CHECK: [[IF_TRUE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret i1 true ; entry: @@ -19,12 +24,16 @@ end: } define i1 @float.2(i1 %cmp) { -; CHECK-LABEL: define i1 @float.2(i1 %cmp) { - -; CHECK-LABEL: end: -; CHECK-NEXT: %p = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %if.true ] -; CHECK-NEXT: %c = fcmp ueq float %p, 1.000000e+00 -; CHECK-NEXT: ret i1 %c +; CHECK-LABEL: define i1 @float.2( +; CHECK-SAME: i1 [[CMP:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_TRUE:.*]], 
label %[[END:.*]] +; CHECK: [[IF_TRUE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[P:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ 2.000000e+00, %[[IF_TRUE]] ] +; CHECK-NEXT: [[C:%.*]] = fcmp ueq float [[P]], 1.000000e+00 +; CHECK-NEXT: ret i1 [[C]] ; entry: br i1 %cmp, label %if.true, label %end @@ -39,13 +48,18 @@ end: } define i1 @float.3(float %f, i1 %cmp) { -; CHECK-LABEL: define i1 @float.3(float %f, i1 %cmp) - -; CHECK-LABEL: end: -; CHECK-NEXT: %p = phi float [ 1.000000e+00, %entry ], [ %f, %if.true ] -; CHECK-NEXT: %c = fcmp ueq float %p, 1.000000e+00 -; CHECK-NEXT: ret i1 %c +; CHECK-LABEL: define i1 @float.3( +; CHECK-SAME: float [[F:%.*]], i1 [[CMP:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]] +; CHECK: [[IF_TRUE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[P:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[F]], %[[IF_TRUE]] ] +; CHECK-NEXT: [[C:%.*]] = fcmp ueq float [[P]], 1.000000e+00 +; CHECK-NEXT: ret i1 [[C]] ; + entry: br i1 %cmp, label %if.true, label %end @@ -60,11 +74,16 @@ end: define i1 @float.4_unreachable(float %f, i1 %cmp) { -; CHECK-LABEL: define i1 @float.4_unreachable(float %f, i1 %cmp) - -; CHECK-LABEL: end: +; CHECK-LABEL: define i1 @float.4_unreachable( +; CHECK-SAME: float [[F:%.*]], i1 [[CMP:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_TRUE:.*]], label %[[END:.*]] +; CHECK: [[IF_TRUE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret i1 false ; + entry: br i1 %cmp, label %if.true, label %end @@ -79,3 +98,77 @@ end: %c = fcmp une float %p, 1.0 ret i1 %c } + +define <2 x i16> @phi_vector_merge1(i1 %c, <2 x i8> %a) { +; CHECK-LABEL: define <2 x i16> @phi_vector_merge1( +; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ [[ZEXT]], %[[ENTRY]] ], [ , %[[IF]] ] +; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: ret <2 x i16> [[ADD]] +; +entry: + %zext = zext <2 x i8> %a to <2 x i16> + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi <2 x i16> [ %zext, %entry ], [ , %if ] + %add = add <2 x i16> %phi, + ret <2 x i16> %add +} + +define <2 x i16> @phi_vector_merge2(i1 %c, <2 x i8> %a) { +; CHECK-LABEL: define <2 x i16> @phi_vector_merge2( +; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ , %[[ENTRY]] ], [ [[ZEXT]], %[[IF]] ] +; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: ret <2 x i16> [[ADD]] +; +entry: + %zext = zext <2 x i8> %a to <2 x i16> + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi <2 x i16> [ , %entry ], [ %zext, %if ] + %add = add <2 x i16> %phi, + ret <2 x i16> %add +} + +define <2 x float> @phi_vector_merge_float(i1 %c) { +; CHECK-LABEL: define <2 x float> @phi_vector_merge_float( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br 
label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi <2 x float> [ , %[[ENTRY]] ], [ , %[[IF]] ] +; CHECK-NEXT: ret <2 x float> [[PHI]] +; +entry: + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi <2 x float> [ , %entry ], [ , %if ] + ret <2 x float> %phi +} From 474d35f238d46010d12485734e62de91cb469404 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Thu, 18 Jul 2024 10:46:26 +0200 Subject: [PATCH 399/777] [safestack] Various Solaris fixes (#99290) Even with the `-u __safestack_init` link order fixed on Solaris, there are still several safestack test issues left: - While 540fd42c755f20f7b79c6c79493ec36d8cb9b3d3 enabled safestack on Solaris in the driver unconditionally, it ignored that Solaris also exists on SPARC and forgot to enable SPARC support for the runtime lib. This patch fixes that. - The tests fail to link with undefined references to `__sanitizer_internal_memset` etc in `safestack.cpp.o` and `interception_linux.cpp.o`. These are from indirectly including `sanitizer_redefine_builtins.h`. Instead of using the implementations from `sanitizer_common` as was done in [[safestack] Various Solaris fixes](https://github.com/llvm/llvm-project/pull/98469), this patch disables the interception as discussed in [Revert "[safestack] Various Solaris fixes"](https://github.com/llvm/llvm-project/pull/98541). A similar issue affects 32-bit Linux/sparc where compiling `safestack.cpp` with `-ftrivial-auto-var-init=pattern` causes the compiler to generate calls to `memset` to initialize a `pthread_attr_t` which is larger than can be handled inline. This is avoided by defining `SANITIZER_COMMON_NO_REDEFINE_BUILTINS` in `safestack.cpp` and also adding definitions of the interceptors that just forward to `libc` for the benefit of `interception_linux.cpp`. - The `pthread*.c` tests `FAIL` with ``` safestack CHECK failed: /vol/llvm/src/llvm-project/local/compiler-rt/lib/safestack/safestack.cpp:227 size ``` The problem is that `pthread_attr_init` initializes the `stacksize` attribute to 0, signifying the default. Unless explicitly overridded, it stays that way. I think this is allowed by XPG7. Since safestack cannot deal with this, I set `size` to the defaults documented in `pthread_create(3C)`. Unfortunately, there's no macro for those values outside of private `libc` headers. - The Solaris `syscall` interface isn't stable. This is not just a theoretical concern, but the syscalls have changed incompatibly several times in the past. Therefore this patch switches the implementations of `TgKill` (where `SYS_lwp_kill` doesn't exist on Solaris 11.4 anyway), `Mmap`, `Munmap`, and `Mprotect` to the same `_REAL*` solution already used in `sanitizer_solaris.cpp`. With those changes, safestack compiles and all tests `PASS`, so the tests are re-enabled for good. Tested on `amd64-pc-solaris2.11`, `sparcv9-sun-solaris2.11`, `x86_64-pc-linux-gnu`, and `sparc64-unknown-linux-gnu`. 
--- .../cmake/Modules/AllSupportedArchDefs.cmake | 2 +- compiler-rt/lib/safestack/safestack.cpp | 36 +++++++++++++++++++ .../lib/safestack/safestack_platform.h | 35 ++++++++++++++---- compiler-rt/test/safestack/lit.cfg.py | 2 +- 4 files changed, 66 insertions(+), 9 deletions(-) diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index c8bec41db36e9..02ff92f693810 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -77,7 +77,7 @@ set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64} ${MIPS32} ${MIPS64} ${PPC64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON} ${LOONGARCH64}) set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64} - ${HEXAGON} ${LOONGARCH64}) + ${HEXAGON} ${LOONGARCH64} ${SPARC} ${SPARCV9}) set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${MIPS64} ${HEXAGON} ${LOONGARCH64}) set(ALL_SCUDO_STANDALONE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} diff --git a/compiler-rt/lib/safestack/safestack.cpp b/compiler-rt/lib/safestack/safestack.cpp index 0751f3988b9c1..f01a642987646 100644 --- a/compiler-rt/lib/safestack/safestack.cpp +++ b/compiler-rt/lib/safestack/safestack.cpp @@ -13,14 +13,39 @@ // //===----------------------------------------------------------------------===// +#define SANITIZER_COMMON_NO_REDEFINE_BUILTINS + #include "safestack_platform.h" #include "safestack_util.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include +#include #include #include "interception/interception.h" +// interception.h drags in sanitizer_redefine_builtins.h, which in turn +// creates references to __sanitizer_internal_memcpy etc. The interceptors +// aren't needed here, so just forward to libc. +extern "C" { +SANITIZER_INTERFACE_ATTRIBUTE void *__sanitizer_internal_memcpy(void *dest, + const void *src, + size_t n) { + return memcpy(dest, src, n); +} + +SANITIZER_INTERFACE_ATTRIBUTE void *__sanitizer_internal_memmove( + void *dest, const void *src, size_t n) { + return memmove(dest, src, n); +} + +SANITIZER_INTERFACE_ATTRIBUTE void *__sanitizer_internal_memset(void *s, int c, + size_t n) { + return memset(s, c, n); +} +} // extern "C" + using namespace safestack; // TODO: To make accessing the unsafe stack pointer faster, we plan to @@ -224,6 +249,17 @@ INTERCEPTOR(int, pthread_create, pthread_t *thread, pthread_attr_destroy(&tmpattr); } +#if SANITIZER_SOLARIS + // Solaris pthread_attr_init initializes stacksize to 0 (the default), so + // hardcode the actual values as documented in pthread_create(3C). + if (size == 0) +# if defined(_LP64) + size = 2 * 1024 * 1024; +# else + size = 1024 * 1024; +# endif +#endif + SFS_CHECK(size); size = RoundUpTo(size, kStackAlign); diff --git a/compiler-rt/lib/safestack/safestack_platform.h b/compiler-rt/lib/safestack/safestack_platform.h index d4b2e2ef7391c..77eeb9cda6e15 100644 --- a/compiler-rt/lib/safestack/safestack_platform.h +++ b/compiler-rt/lib/safestack/safestack_platform.h @@ -17,6 +17,7 @@ #include "sanitizer_common/sanitizer_platform.h" #include +#include #include #include #include @@ -68,6 +69,24 @@ static void *GetRealLibcAddress(const char *symbol) { SFS_CHECK(real_##func); #endif +#if SANITIZER_SOLARIS +# define _REAL(func) _##func +# define DEFINE__REAL(ret_type, func, ...) 
\ + extern "C" ret_type _REAL(func)(__VA_ARGS__) + +# if !defined(_LP64) && _FILE_OFFSET_BITS == 64 +# define _REAL64(func) _##func##64 +# else +# define _REAL64(func) _REAL(func) +# endif +# define DEFINE__REAL64(ret_type, func, ...) \ + extern "C" ret_type _REAL64(func)(__VA_ARGS__) + +DEFINE__REAL64(void *, mmap, void *a, size_t b, int c, int d, int e, off_t f); +DEFINE__REAL(int, munmap, void *a, size_t b); +DEFINE__REAL(int, mprotect, void *a, size_t b, int c); +#endif + using ThreadId = uint64_t; inline ThreadId GetTid() { @@ -91,11 +110,10 @@ inline int TgKill(pid_t pid, ThreadId tid, int sig) { (void)pid; return _REAL(_lwp_kill, tid, sig); #elif SANITIZER_SOLARIS -# ifdef SYS_lwp_kill - return syscall(SYS_lwp_kill, tid, sig); -# else - return -1; -# endif + (void)pid; + errno = thr_kill(tid, sig); + // TgKill is expected to return -1 on error, not an errno. + return errno != 0 ? -1 : 0; #elif SANITIZER_FREEBSD return syscall(SYS_thr_kill2, pid, tid, sig); #else @@ -110,8 +128,7 @@ inline void *Mmap(void *addr, size_t length, int prot, int flags, int fd, #elif SANITIZER_FREEBSD && (defined(__aarch64__) || defined(__x86_64__)) return (void *)__syscall(SYS_mmap, addr, length, prot, flags, fd, offset); #elif SANITIZER_SOLARIS - return (void *)(uintptr_t)syscall(SYS_mmap, addr, length, prot, flags, fd, - offset); + return _REAL64(mmap)(addr, length, prot, flags, fd, offset); #else return (void *)syscall(SYS_mmap, addr, length, prot, flags, fd, offset); #endif @@ -121,6 +138,8 @@ inline int Munmap(void *addr, size_t length) { #if SANITIZER_NETBSD DEFINE__REAL(int, munmap, void *a, size_t b); return _REAL(munmap, addr, length); +#elif SANITIZER_SOLARIS + return _REAL(munmap)(addr, length); #else return syscall(SYS_munmap, addr, length); #endif @@ -130,6 +149,8 @@ inline int Mprotect(void *addr, size_t length, int prot) { #if SANITIZER_NETBSD DEFINE__REAL(int, mprotect, void *a, size_t b, int c); return _REAL(mprotect, addr, length, prot); +#elif SANITIZER_SOLARIS + return _REAL(mprotect)(addr, length, prot); #else return syscall(SYS_mprotect, addr, length, prot); #endif diff --git a/compiler-rt/test/safestack/lit.cfg.py b/compiler-rt/test/safestack/lit.cfg.py index aadb8bf0d5c77..17dfae46a412b 100644 --- a/compiler-rt/test/safestack/lit.cfg.py +++ b/compiler-rt/test/safestack/lit.cfg.py @@ -33,5 +33,5 @@ ) ) -if config.host_os not in ["Linux", "FreeBSD", "NetBSD"]: +if config.host_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: config.unsupported = True From b1864a8d6ab8bfd346922e36d80e684a4eaf3248 Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Thu, 18 Jul 2024 10:49:19 +0200 Subject: [PATCH 400/777] [lld-macho] Ignore duplicate `-rpath` entries (#99289) Starting with Xcode 16 (dyld-1122), Apple's binary utilities, e.g. `dyld_info` (but not dyld itself), will refuse to load binaries built against the macOS 15 SDK or newer that contain the same `LC_RPATH` entry multiple times: https://github.com/apple-oss-distributions/dyld/blob/rel/dyld-1122/mach_o/Policy.cpp#L246-L249 `ld-prime` deduplicates entries (regardless of the deployment target), we now do the same. We also match `ld-prime`'s and `ld64`'s behavior by warning on duplicate `-rpath` arguments. This can be disabled by the LLD-specific `--no-warn-duplicate-rpath` flag. 
--- lld/MachO/Config.h | 1 + lld/MachO/Driver.cpp | 17 ++++++++++++++++- lld/MachO/Options.td | 6 ++++++ lld/test/MachO/link-search-at-rpath.s | 1 + lld/test/MachO/rpath.s | 15 +++++++++++++++ 5 files changed, 39 insertions(+), 1 deletion(-) diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 4d3f3d05c2338..5c354e0fe8821 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -199,6 +199,7 @@ struct Configuration { std::vector systemLibraryRoots; std::vector librarySearchPaths; std::vector frameworkSearchPaths; + bool warnDuplicateRpath = true; llvm::SmallVector runtimePaths; std::vector astPaths; std::vector explicitUndefineds; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index a370d5734124a..ffb3feae25ca4 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1402,6 +1402,19 @@ static void eraseInitializerSymbols() { sym->used = false; } +static SmallVector getRuntimePaths(opt::InputArgList &args) { + SmallVector vals; + DenseSet seen; + for (const Arg *arg : args.filtered(OPT_rpath)) { + StringRef val = arg->getValue(); + if (seen.insert(val).second) + vals.push_back(val); + else if (config->warnDuplicateRpath) + warn("duplicate -rpath '" + val + "' ignored [--warn-duplicate-rpath]"); + } + return vals; +} + namespace lld { namespace macho { bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, @@ -1642,7 +1655,9 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, error("--thinlto-prefix-replace=old_dir;new_dir;obj_dir must be used with " "--thinlto-index-only="); } - config->runtimePaths = args::getStrings(args, OPT_rpath); + config->warnDuplicateRpath = + args.hasFlag(OPT_warn_duplicate_rpath, OPT_no_warn_duplicate_rpath, true); + config->runtimePaths = getRuntimePaths(args); config->allLoad = args.hasFlag(OPT_all_load, OPT_noall_load, false); config->archMultiple = args.hasArg(OPT_arch_multiple); config->applicationExtension = args.hasFlag( diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index aecced9279da4..dc2212399222f 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -111,6 +111,12 @@ def no_warn_dylib_install_name: Flag<["--"], "no-warn-dylib-install-name">, def warn_dylib_install_name: Flag<["--"], "warn-dylib-install-name">, HelpText<"Warn on -install_name if -dylib is not passed">, Group; +def warn_duplicate_rpath: Flag<["--"], "warn-duplicate-rpath">, + HelpText<"Warn if the same -rpath is specified multiple times (default)">, + Group; +def no_warn_duplicate_rpath: Flag<["--"], "no-warn-duplicate-rpath">, + HelpText<"Do not warn if the same -rpath is specified multiple times">, + Group; def call_graph_profile_sort: Flag<["--"], "call-graph-profile-sort">, HelpText<"Reorder sections with call graph profile (default)">, Group; diff --git a/lld/test/MachO/link-search-at-rpath.s b/lld/test/MachO/link-search-at-rpath.s index dbc0d4b3cbf63..71d27fc5033bb 100644 --- a/lld/test/MachO/link-search-at-rpath.s +++ b/lld/test/MachO/link-search-at-rpath.s @@ -14,6 +14,7 @@ # RUN: -rpath @loader_path/../foo \ # RUN: -rpath @loader_path/../subdir \ # RUN: -rpath @loader_path/../foo \ +# RUN: --no-warn-duplicate-rpath \ # RUN: %t/bar.o -o %t/subdir2/libbar.dylib # RUN: %lld -lSystem %t/main.o %t/subdir2/libbar.dylib -o %t/test diff --git a/lld/test/MachO/rpath.s b/lld/test/MachO/rpath.s index 5b404a36b26b0..09ae108b34a21 100644 --- a/lld/test/MachO/rpath.s +++ b/lld/test/MachO/rpath.s @@ -12,6 +12,21 @@ # CHECK-NEXT: cmdsize 32 # CHECK-NEXT: path /another/rpath +## Check that -rpath entries are deduplicated. 
+# RUN: not %lld %t.o -o /dev/null -rpath /some/rpath -rpath /other/rpath -rpath /some/rpath 2>&1 | \ +# RUN: FileCheck --check-prefix=FATAL %s +# FATAL: error: duplicate -rpath '/some/rpath' ignored [--warn-duplicate-rpath] + +# RUN: %lld -o %t-dup %t.o -rpath /some/rpath -rpath /other/rpath -rpath /some/rpath --no-warn-duplicate-rpath +# RUN: llvm-objdump --macho --all-headers %t-dup | FileCheck %s --check-prefix=DEDUP +# DEDUP: LC_RPATH +# DEDUP-NEXT: cmdsize 24 +# DEDUP-NEXT: path /some/rpath +# DEDUP: LC_RPATH +# DEDUP-NEXT: cmdsize 32 +# DEDUP-NEXT: path /other/rpath +# DEDUP-NOT: LC_RPATH + .text .global _main _main: From 1ce89899ad33a0d2976859d8d278dba4342cbb6b Mon Sep 17 00:00:00 2001 From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:55:36 +0100 Subject: [PATCH 401/777] [clang-tidy] Fix false in unnecessary-value-param inside templates (#98488) Summary: If callExpr is type dependent, there is no way to analyze individual arguments until template specialization. Before this diff only calls with dependent callees were skipped so unnecessary-value-param was processing arguments that had non-dependent type that gave false positives because the call was not fully resolved till specialization. So now instead of checking type dependent callee, the whole expression will be checked for type dependent. Test Plan: check-clang-tools --- clang-tools-extra/docs/ReleaseNotes.rst | 3 +- .../performance/unnecessary-value-param.cpp | 34 +++++++++++++++++ clang/lib/Analysis/ExprMutationAnalyzer.cpp | 37 +++++++++---------- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 697b514ae1572..a23483e6df6d2 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -450,7 +450,8 @@ Changes in existing checks ` check detecting more cases for template functions including lambdas with ``auto``. E.g., ``std::sort(a.begin(), a.end(), [](auto x, auto y) { return a > b; });`` - will be detected for expensive to copy types. + will be detected for expensive to copy types. Fixed false positives for + dependent call expressions. 
- Improved :doc:`readability-avoid-return-with-void-value ` check by adding diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp index 0dffaefa213a4..7c7ae43698929 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp @@ -2,6 +2,31 @@ // CHECK-FIXES: #include +namespace std { +template +struct remove_reference; + +template +struct remove_reference { + typedef _Tp type; +}; + +template +struct remove_reference<_Tp &> { + typedef _Tp type; +}; + +template +struct remove_reference<_Tp &&> { + typedef _Tp type; +}; + +template +constexpr typename std::remove_reference<_Tp>::type &&move(_Tp &&__t) { + return static_cast::type &&>(__t); +} +} // namespace std + struct ExpensiveToCopyType { const ExpensiveToCopyType & constReference() const { return *this; @@ -357,3 +382,12 @@ void fun() { ExpensiveToCopyType E; NegativeUsingConstructor S(E); } + +struct B { + static void bar(ExpensiveMovableType a, ExpensiveMovableType b); +}; + +template +void NegativeCallWithDependentAndNondependentArgs(ExpensiveMovableType a, T b) { + B::bar(std::move(a), b); +} diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 3b3782fa1db9a..6d726ae44104e 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -404,25 +404,24 @@ ExprMutationAnalyzer::Analyzer::findDirectMutation(const Expr *Exp) { memberExpr(hasObjectExpression(canResolveToExpr(Exp)))), nonConstReferenceType()); const auto NotInstantiated = unless(hasDeclaration(isInstantiated())); - const auto TypeDependentCallee = - callee(expr(anyOf(unresolvedLookupExpr(), unresolvedMemberExpr(), - cxxDependentScopeMemberExpr(), - hasType(templateTypeParmType()), isTypeDependent()))); - - const auto AsNonConstRefArg = anyOf( - callExpr(NonConstRefParam, NotInstantiated), - cxxConstructExpr(NonConstRefParam, NotInstantiated), - callExpr(TypeDependentCallee, hasAnyArgument(canResolveToExpr(Exp))), - cxxUnresolvedConstructExpr(hasAnyArgument(canResolveToExpr(Exp))), - // Previous False Positive in the following Code: - // `template void f() { int i = 42; new Type(i); }` - // Where the constructor of `Type` takes its argument as reference. - // The AST does not resolve in a `cxxConstructExpr` because it is - // type-dependent. - parenListExpr(hasDescendant(expr(canResolveToExpr(Exp)))), - // If the initializer is for a reference type, there is no cast for - // the variable. Values are cast to RValue first. - initListExpr(hasAnyInit(expr(canResolveToExpr(Exp))))); + + const auto AsNonConstRefArg = + anyOf(callExpr(NonConstRefParam, NotInstantiated), + cxxConstructExpr(NonConstRefParam, NotInstantiated), + // If the call is type-dependent, we can't properly process any + // argument because required type conversions and implicit casts + // will be inserted only after specialization. + callExpr(isTypeDependent(), hasAnyArgument(canResolveToExpr(Exp))), + cxxUnresolvedConstructExpr(hasAnyArgument(canResolveToExpr(Exp))), + // Previous False Positive in the following Code: + // `template void f() { int i = 42; new Type(i); }` + // Where the constructor of `Type` takes its argument as reference. + // The AST does not resolve in a `cxxConstructExpr` because it is + // type-dependent. 
+ parenListExpr(hasDescendant(expr(canResolveToExpr(Exp)))), + // If the initializer is for a reference type, there is no cast for + // the variable. Values are cast to RValue first. + initListExpr(hasAnyInit(expr(canResolveToExpr(Exp))))); // Captured by a lambda by reference. // If we're initializing a capture with 'Exp' directly then we're initializing From dfddc0c4843baaf9605aeb1c4f82eac185e90265 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 18 Jul 2024 10:59:58 +0200 Subject: [PATCH 402/777] [libc++] Include the rest of the detail headers by version in the umbrella headers (#96032) This is a follow-up to #83740. --- libcxx/include/__type_traits/make_signed.h | 1 + libcxx/include/atomic | 5 +- libcxx/include/expected | 18 +++- libcxx/include/filesystem | 36 +++---- libcxx/include/format | 81 ++++++++++------ libcxx/include/forward_list | 1 + libcxx/include/functional | 47 ++++++--- libcxx/include/iterator | 56 ++++++----- libcxx/include/mdspan | 27 ++++-- libcxx/include/memory | 28 ++++-- libcxx/include/memory_resource | 26 +++-- libcxx/include/module.modulemap | 16 +++- libcxx/include/numeric | 29 ++++-- libcxx/include/ostream | 9 +- libcxx/include/random | 3 - libcxx/include/ranges | 95 +++++++++++-------- libcxx/include/stop_token | 9 +- libcxx/include/string_view | 1 + libcxx/include/type_traits | 63 ++++++------ libcxx/include/utility | 38 +++++--- .../sequences/deque/abi.compile.pass.cpp | 1 + .../sequences/list/abi.compile.pass.cpp | 1 + .../bounded_iter/arithmetic.pass.cpp | 2 +- .../bounded_iter/comparison.pass.cpp | 2 +- .../bounded_iter/dereference.pass.cpp | 2 +- .../bounded_iter/pointer_traits.pass.cpp | 3 +- .../bounded_iter/types.compile.pass.cpp | 1 + .../libcxx/memory/allocation_guard.pass.cpp | 2 +- .../compressed_pair/compressed_pair.pass.cpp | 2 +- .../numerics/clamp_to_integral.pass.cpp | 2 +- .../as-lvalue.lifetimebound.verify.cpp | 2 +- .../range.adaptor.helpers/as-lvalue.pass.cpp | 1 + .../test/libcxx/transitive_includes/cxx03.csv | 1 + .../test/libcxx/transitive_includes/cxx11.csv | 1 + .../test/libcxx/transitive_includes/cxx14.csv | 1 + .../test/libcxx/transitive_includes/cxx17.csv | 1 + .../test/libcxx/transitive_includes/cxx20.csv | 2 + .../test/libcxx/transitive_includes/cxx23.csv | 1 - .../test/libcxx/transitive_includes/cxx26.csv | 1 - .../type_traits/is_callable.compile.pass.cpp | 2 +- .../is_constant_evaluated.pass.cpp | 2 +- ..._implicitly_default_constructible.pass.cpp | 3 +- .../is_specialization.compile.pass.cpp | 5 +- .../type_traits/is_specialization.verify.cpp | 3 +- .../type_traits/lazy_metafunctions.pass.cpp | 3 + .../exception_guard.no_exceptions.pass.cpp | 1 + .../libcxx/utilities/exception_guard.pass.cpp | 1 + .../func.bind.partial/compose.pass.cpp | 2 +- .../libcxx/utilities/meta/meta_base.pass.cpp | 5 +- .../make.heap/ranges_make_heap.pass.cpp | 1 + .../pop.heap/ranges_pop_heap.pass.cpp | 1 + .../push.heap/ranges_push_heap.pass.cpp | 1 + .../range.owning.view/begin_end.pass.cpp | 1 + .../iterator/plus_eq.pass.cpp | 3 +- libcxx/test/support/test_iterators.h | 1 + 55 files changed, 417 insertions(+), 236 deletions(-) diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h index c1fc009d9ba2e..d09d6ed4a1e7c 100644 --- a/libcxx/include/__type_traits/make_signed.h +++ b/libcxx/include/__type_traits/make_signed.h @@ -10,6 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_MAKE_SIGNED_H #include <__config> +#include <__type_traits/copy_cv.h> #include <__type_traits/is_enum.h> #include 
<__type_traits/is_integral.h> #include <__type_traits/nat.h> diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 80a0f9ee373e9..0d13619d6ce45 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -599,7 +599,6 @@ template #include <__atomic/atomic_flag.h> #include <__atomic/atomic_init.h> #include <__atomic/atomic_lock_free.h> -#include <__atomic/atomic_ref.h> #include <__atomic/atomic_sync.h> #include <__atomic/check_memory_order.h> #include <__atomic/contention_t.h> @@ -610,6 +609,10 @@ template #include <__atomic/memory_order.h> #include +#if _LIBCPP_STD_VER >= 20 +# include <__atomic/atomic_ref.h> +#endif + #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif diff --git a/libcxx/include/expected b/libcxx/include/expected index f455ab7d5d61c..6a2f12f2bf3b5 100644 --- a/libcxx/include/expected +++ b/libcxx/include/expected @@ -39,14 +39,24 @@ namespace std { */ #include <__config> -#include <__expected/bad_expected_access.h> -#include <__expected/expected.h> -#include <__expected/unexpect.h> -#include <__expected/unexpected.h> + +#if _LIBCPP_STD_VER >= 23 +# include <__expected/bad_expected_access.h> +# include <__expected/expected.h> +# include <__expected/unexpect.h> +# include <__expected/unexpected.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +# include +# include +#endif + #endif // _LIBCPP_EXPECTED diff --git a/libcxx/include/filesystem b/libcxx/include/filesystem index eff7dff4a4551..6ea04df0a089b 100644 --- a/libcxx/include/filesystem +++ b/libcxx/include/filesystem @@ -534,22 +534,26 @@ inline constexpr bool std::ranges::enable_view -#include <__filesystem/copy_options.h> -#include <__filesystem/directory_entry.h> -#include <__filesystem/directory_iterator.h> -#include <__filesystem/directory_options.h> -#include <__filesystem/file_status.h> -#include <__filesystem/file_time_type.h> -#include <__filesystem/file_type.h> -#include <__filesystem/filesystem_error.h> -#include <__filesystem/operations.h> -#include <__filesystem/path.h> -#include <__filesystem/path_iterator.h> -#include <__filesystem/perm_options.h> -#include <__filesystem/perms.h> -#include <__filesystem/recursive_directory_iterator.h> -#include <__filesystem/space_info.h> -#include <__filesystem/u8path.h> + +#if _LIBCPP_STD_VER >= 17 +# include <__filesystem/copy_options.h> +# include <__filesystem/directory_entry.h> +# include <__filesystem/directory_iterator.h> +# include <__filesystem/directory_options.h> +# include <__filesystem/file_status.h> +# include <__filesystem/file_time_type.h> +# include <__filesystem/file_type.h> +# include <__filesystem/filesystem_error.h> +# include <__filesystem/operations.h> +# include <__filesystem/path.h> +# include <__filesystem/path_iterator.h> +# include <__filesystem/perm_options.h> +# include <__filesystem/perms.h> +# include <__filesystem/recursive_directory_iterator.h> +# include <__filesystem/space_info.h> +# include <__filesystem/u8path.h> +#endif + #include // standard-mandated includes diff --git a/libcxx/include/format b/libcxx/include/format index 07c2ba083199e..c3f2b45f0f730 100644 --- a/libcxx/include/format +++ b/libcxx/include/format @@ -189,40 +189,65 @@ namespace std { */ #include <__config> -#include <__format/buffer.h> -#include <__format/concepts.h> -#include <__format/container_adaptor.h> -#include <__format/enable_insertable.h> -#include 
<__format/escaped_output_table.h> -#include <__format/extended_grapheme_cluster_table.h> -#include <__format/format_arg.h> -#include <__format/format_arg_store.h> -#include <__format/format_args.h> -#include <__format/format_context.h> -#include <__format/format_error.h> -#include <__format/format_functions.h> -#include <__format/format_parse_context.h> -#include <__format/format_string.h> -#include <__format/format_to_n_result.h> -#include <__format/formatter.h> -#include <__format/formatter_bool.h> -#include <__format/formatter_char.h> -#include <__format/formatter_floating_point.h> -#include <__format/formatter_integer.h> -#include <__format/formatter_pointer.h> -#include <__format/formatter_string.h> -#include <__format/formatter_tuple.h> -#include <__format/parser_std_format_spec.h> -#include <__format/range_default_formatter.h> -#include <__format/range_formatter.h> -#include <__format/unicode.h> -#include <__fwd/format.h> + +#if _LIBCPP_STD_VER >= 20 +# include <__format/buffer.h> +# include <__format/concepts.h> +# include <__format/container_adaptor.h> +# include <__format/enable_insertable.h> +# include <__format/escaped_output_table.h> +# include <__format/extended_grapheme_cluster_table.h> +# include <__format/format_arg.h> +# include <__format/format_arg_store.h> +# include <__format/format_args.h> +# include <__format/format_context.h> +# include <__format/format_error.h> +# include <__format/format_functions.h> +# include <__format/format_parse_context.h> +# include <__format/format_string.h> +# include <__format/format_to_n_result.h> +# include <__format/formatter.h> +# include <__format/formatter_bool.h> +# include <__format/formatter_char.h> +# include <__format/formatter_floating_point.h> +# include <__format/formatter_integer.h> +# include <__format/formatter_pointer.h> +# include <__format/formatter_string.h> +# include <__format/formatter_tuple.h> +# include <__format/parser_std_format_spec.h> +# include <__format/range_default_formatter.h> +# include <__format/range_formatter.h> +# include <__format/unicode.h> +# include <__fwd/format.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 1ae19d23f88cc..b14d2cb6c7803 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -224,6 +224,7 @@ template #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_same.h> +#include <__type_traits/is_swappable.h> #include <__type_traits/type_identity.h> #include <__utility/forward.h> #include <__utility/move.h> diff --git a/libcxx/include/functional b/libcxx/include/functional index 27cf21e1a4c8b..3d39f654ddb08 100644 --- a/libcxx/include/functional +++ b/libcxx/include/functional @@ -527,41 +527,60 @@ POLICY: For non-variadic implementations, the number of arguments is limited */ -#include <__algorithm/search.h> -#include <__compare/compare_three_way.h> #include <__config> + #include <__functional/binary_function.h> #include <__functional/binary_negate.h> #include <__functional/bind.h> -#include 
<__functional/bind_back.h> -#include <__functional/bind_front.h> #include <__functional/binder1st.h> #include <__functional/binder2nd.h> -#include <__functional/boyer_moore_searcher.h> -#include <__functional/compose.h> -#include <__functional/default_searcher.h> -#include <__functional/function.h> #include <__functional/hash.h> -#include <__functional/identity.h> -#include <__functional/invoke.h> #include <__functional/mem_fn.h> // TODO: deprecate #include <__functional/mem_fun_ref.h> -#include <__functional/not_fn.h> #include <__functional/operations.h> #include <__functional/pointer_to_binary_function.h> #include <__functional/pointer_to_unary_function.h> -#include <__functional/ranges_operations.h> #include <__functional/reference_wrapper.h> #include <__functional/unary_function.h> #include <__functional/unary_negate.h> -#include <__type_traits/unwrap_ref.h> -#include <__utility/forward.h> + +#ifndef _LIBCPP_CXX03_LANG +# include <__functional/function.h> +#endif + +#if _LIBCPP_STD_VER >= 17 +# include <__functional/boyer_moore_searcher.h> +# include <__functional/default_searcher.h> +# include <__functional/invoke.h> +# include <__functional/not_fn.h> +#endif + +#if _LIBCPP_STD_VER >= 20 +# include <__functional/bind_back.h> +# include <__functional/bind_front.h> +# include <__functional/identity.h> +# include <__functional/ranges_operations.h> +# include <__type_traits/unwrap_ref.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && defined(_LIBCPP_CXX03_LANG) +# include +# include +#endif + +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 +# include +# include +# include +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/iterator b/libcxx/include/iterator index 1b9e7eaf0c1e8..fca75f0a19ed1 100644 --- a/libcxx/include/iterator +++ b/libcxx/include/iterator @@ -683,43 +683,49 @@ template constexpr const E* data(initializer_list il) noexcept; #include <__iterator/access.h> #include <__iterator/advance.h> #include <__iterator/back_insert_iterator.h> -#include <__iterator/bounded_iter.h> -#include <__iterator/common_iterator.h> -#include <__iterator/concepts.h> -#include <__iterator/counted_iterator.h> -#include <__iterator/data.h> -#include <__iterator/default_sentinel.h> #include <__iterator/distance.h> -#include <__iterator/empty.h> -#include <__iterator/erase_if_container.h> #include <__iterator/front_insert_iterator.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/indirectly_comparable.h> #include <__iterator/insert_iterator.h> #include <__iterator/istream_iterator.h> #include <__iterator/istreambuf_iterator.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> -#include <__iterator/mergeable.h> #include <__iterator/move_iterator.h> -#include <__iterator/move_sentinel.h> #include <__iterator/next.h> #include <__iterator/ostream_iterator.h> #include <__iterator/ostreambuf_iterator.h> -#include <__iterator/permutable.h> #include <__iterator/prev.h> -#include <__iterator/projected.h> -#include <__iterator/readable_traits.h> -#include <__iterator/reverse_access.h> #include <__iterator/reverse_iterator.h> -#include <__iterator/size.h> -#include <__iterator/sortable.h> -#include <__iterator/unreachable_sentinel.h> #include <__iterator/wrap_iter.h> -#include 
<__memory/addressof.h> -#include <__memory/pointer_traits.h> + +#if _LIBCPP_STD_VER >= 14 +# include <__iterator/reverse_access.h> +#endif + +#if _LIBCPP_STD_VER >= 17 +# include <__iterator/data.h> +# include <__iterator/empty.h> +# include <__iterator/size.h> +#endif + +#if _LIBCPP_STD_VER >= 20 +# include <__iterator/common_iterator.h> +# include <__iterator/concepts.h> +# include <__iterator/counted_iterator.h> +# include <__iterator/default_sentinel.h> +# include <__iterator/incrementable_traits.h> +# include <__iterator/indirectly_comparable.h> +# include <__iterator/iter_move.h> +# include <__iterator/iter_swap.h> +# include <__iterator/mergeable.h> +# include <__iterator/move_sentinel.h> +# include <__iterator/permutable.h> +# include <__iterator/projected.h> +# include <__iterator/readable_traits.h> +# include <__iterator/sortable.h> +# include <__iterator/unreachable_sentinel.h> +#endif + #include // standard-mandated includes @@ -732,6 +738,10 @@ template constexpr const E* data(initializer_list il) noexcept; # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/mdspan b/libcxx/include/mdspan index aa7ba278b1aa0..29190e4a9953e 100644 --- a/libcxx/include/mdspan +++ b/libcxx/include/mdspan @@ -409,17 +409,30 @@ namespace std { #define _LIBCPP_MDSPAN #include <__config> -#include <__fwd/mdspan.h> -#include <__mdspan/default_accessor.h> -#include <__mdspan/extents.h> -#include <__mdspan/layout_left.h> -#include <__mdspan/layout_right.h> -#include <__mdspan/layout_stride.h> -#include <__mdspan/mdspan.h> + +#if _LIBCPP_STD_VER >= 23 +# include <__fwd/mdspan.h> +# include <__mdspan/default_accessor.h> +# include <__mdspan/extents.h> +# include <__mdspan/layout_left.h> +# include <__mdspan/layout_right.h> +# include <__mdspan/layout_stride.h> +# include <__mdspan/mdspan.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +# include +# include +# include +# include +# include +#endif + #endif // _LIBCPP_MDSPAN diff --git a/libcxx/include/memory b/libcxx/include/memory index a8c0264eb9eb7..d52ee7b4c8eee 100644 --- a/libcxx/include/memory +++ b/libcxx/include/memory @@ -920,30 +920,38 @@ template #include <__config> #include <__memory/addressof.h> #include <__memory/align.h> -#include <__memory/allocate_at_least.h> -#include <__memory/allocation_guard.h> #include <__memory/allocator.h> #include <__memory/allocator_arg_t.h> #include <__memory/allocator_traits.h> -#include <__memory/assume_aligned.h> #include <__memory/auto_ptr.h> -#include <__memory/compressed_pair.h> -#include <__memory/concepts.h> -#include <__memory/construct_at.h> #include <__memory/pointer_traits.h> -#include <__memory/ranges_construct_at.h> -#include <__memory/ranges_uninitialized_algorithms.h> #include <__memory/raw_storage_iterator.h> #include <__memory/shared_ptr.h> #include <__memory/temporary_buffer.h> #include <__memory/uninitialized_algorithms.h> #include <__memory/unique_ptr.h> #include <__memory/uses_allocator.h> -#include <__memory/uses_allocator_construction.h> -#include // standard-mandated includes +#if _LIBCPP_STD_VER >= 17 +# include <__memory/construct_at.h> +#endif + +#if _LIBCPP_STD_VER >= 20 +# include <__memory/assume_aligned.h> +# include <__memory/concepts.h> +# 
include <__memory/ranges_construct_at.h> +# include <__memory/ranges_uninitialized_algorithms.h> +# include <__memory/uses_allocator_construction.h> +#endif + +#if _LIBCPP_STD_VER >= 23 +# include <__memory/allocate_at_least.h> +#endif + +#include + // [memory.syn] #include diff --git a/libcxx/include/memory_resource b/libcxx/include/memory_resource index e9c87777e8f75..67411054820a1 100644 --- a/libcxx/include/memory_resource +++ b/libcxx/include/memory_resource @@ -50,18 +50,32 @@ namespace std::pmr { */ #include <__config> -#include <__memory_resource/memory_resource.h> -#include <__memory_resource/monotonic_buffer_resource.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__memory_resource/pool_options.h> -#include <__memory_resource/synchronized_pool_resource.h> -#include <__memory_resource/unsynchronized_pool_resource.h> + +#if _LIBCPP_STD_VER >= 17 +# include <__memory_resource/memory_resource.h> +# include <__memory_resource/monotonic_buffer_resource.h> +# include <__memory_resource/polymorphic_allocator.h> +# include <__memory_resource/pool_options.h> +# include <__memory_resource/synchronized_pool_resource.h> +# include <__memory_resource/unsynchronized_pool_resource.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 +# include +# include +# include +# include +# include +# include +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include #endif diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index f4aaa14c1c2ee..7608aef3f3a43 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1081,7 +1081,10 @@ module std_private_charconv_tables [system] { header "__charcon module std_private_charconv_to_chars [system] { header "__charconv/to_chars.h" } module std_private_charconv_to_chars_base_10 [system] { header "__charconv/to_chars_base_10.h" } module std_private_charconv_to_chars_floating_point [system] { header "__charconv/to_chars_floating_point.h" } -module std_private_charconv_to_chars_integral [system] { header "__charconv/to_chars_integral.h" } +module std_private_charconv_to_chars_integral [system] { + header "__charconv/to_chars_integral.h" + export std_private_charconv_traits +} module std_private_charconv_to_chars_result [system] { header "__charconv/to_chars_result.h" export * @@ -1130,6 +1133,7 @@ module std_private_chrono_steady_clock [system] { } module std_private_chrono_time_zone [system] { header "__chrono/time_zone.h" + export std_private_memory_unique_ptr } module std_private_chrono_time_zone_link [system] { header "__chrono/time_zone_link.h" @@ -1924,7 +1928,10 @@ module std_private_type_traits_is_array [system module std_private_type_traits_is_assignable [system] { header "__type_traits/is_assignable.h" } module std_private_type_traits_is_base_of [system] { header "__type_traits/is_base_of.h" } module std_private_type_traits_is_bounded_array [system] { header "__type_traits/is_bounded_array.h" } -module std_private_type_traits_is_callable [system] { header "__type_traits/is_callable.h" } +module std_private_type_traits_is_callable [system] { + header "__type_traits/is_callable.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_char_like_type [system] { header "__type_traits/is_char_like_type.h" } module std_private_type_traits_is_class [system] { header 
"__type_traits/is_class.h" } module std_private_type_traits_is_compound [system] { header "__type_traits/is_compound.h" } @@ -1959,7 +1966,10 @@ module std_private_type_traits_is_final [system module std_private_type_traits_is_floating_point [system] { header "__type_traits/is_floating_point.h" } module std_private_type_traits_is_function [system] { header "__type_traits/is_function.h" } module std_private_type_traits_is_fundamental [system] { header "__type_traits/is_fundamental.h" } -module std_private_type_traits_is_implicitly_default_constructible [system] { header "__type_traits/is_implicitly_default_constructible.h" } +module std_private_type_traits_is_implicitly_default_constructible [system] { + header "__type_traits/is_implicitly_default_constructible.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_integral [system] { header "__type_traits/is_integral.h" } module std_private_type_traits_is_literal_type [system] { header "__type_traits/is_literal_type.h" } module std_private_type_traits_is_member_pointer [system] { header "__type_traits/is_member_pointer.h" } diff --git a/libcxx/include/numeric b/libcxx/include/numeric index 9fb5e9fb1da70..6b92ce3a07123 100644 --- a/libcxx/include/numeric +++ b/libcxx/include/numeric @@ -157,31 +157,40 @@ constexpr T saturate_cast(U x) noexcept; // freestanding, Sin */ #include <__config> -#include #include <__numeric/accumulate.h> #include <__numeric/adjacent_difference.h> -#include <__numeric/exclusive_scan.h> -#include <__numeric/gcd_lcm.h> -#include <__numeric/inclusive_scan.h> #include <__numeric/inner_product.h> #include <__numeric/iota.h> -#include <__numeric/midpoint.h> #include <__numeric/partial_sum.h> -#include <__numeric/reduce.h> -#include <__numeric/saturation_arithmetic.h> -#include <__numeric/transform_exclusive_scan.h> -#include <__numeric/transform_inclusive_scan.h> -#include <__numeric/transform_reduce.h> #if _LIBCPP_STD_VER >= 17 +# include <__numeric/exclusive_scan.h> +# include <__numeric/gcd_lcm.h> +# include <__numeric/inclusive_scan.h> # include <__numeric/pstl.h> +# include <__numeric/reduce.h> +# include <__numeric/transform_exclusive_scan.h> +# include <__numeric/transform_inclusive_scan.h> +# include <__numeric/transform_reduce.h> +#endif + +#if _LIBCPP_STD_VER >= 20 +# include <__numeric/midpoint.h> +# include <__numeric/saturation_arithmetic.h> #endif +#include + #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 +# include +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/ostream b/libcxx/include/ostream index f75110e7d73f7..359d3c0e19c4c 100644 --- a/libcxx/include/ostream +++ b/libcxx/include/ostream @@ -173,8 +173,13 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args); */ #include <__config> + #include <__ostream/basic_ostream.h> -#include <__ostream/print.h> + +#if _LIBCPP_STD_VER >= 23 +# include <__ostream/print.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -186,8 +191,10 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args); # include # include # include +# include # include # include +# include # include # include #endif diff --git a/libcxx/include/random b/libcxx/include/random index 9edd6c4608ec2..6cc3760c20e16 100644 --- a/libcxx/include/random +++ b/libcxx/include/random @@ -1682,7 +1682,6 @@ 
class piecewise_linear_distribution #include <__random/binomial_distribution.h> #include <__random/cauchy_distribution.h> #include <__random/chi_squared_distribution.h> -#include <__random/clamp_to_integral.h> #include <__random/default_random_engine.h> #include <__random/discard_block_engine.h> #include <__random/discrete_distribution.h> @@ -1694,10 +1693,8 @@ class piecewise_linear_distribution #include <__random/geometric_distribution.h> #include <__random/independent_bits_engine.h> #include <__random/is_seed_sequence.h> -#include <__random/is_valid.h> #include <__random/knuth_b.h> #include <__random/linear_congruential_engine.h> -#include <__random/log2.h> #include <__random/lognormal_distribution.h> #include <__random/mersenne_twister_engine.h> #include <__random/negative_binomial_distribution.h> diff --git a/libcxx/include/ranges b/libcxx/include/ranges index 07a525ed8641f..fa35874265de6 100644 --- a/libcxx/include/ranges +++ b/libcxx/include/ranges @@ -381,49 +381,56 @@ namespace std { */ #include <__config> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/as_rvalue_view.h> -#include <__ranges/chunk_by_view.h> -#include <__ranges/common_view.h> -#include <__ranges/concepts.h> -#include <__ranges/counted.h> -#include <__ranges/dangling.h> -#include <__ranges/data.h> -#include <__ranges/drop_view.h> -#include <__ranges/drop_while_view.h> -#include <__ranges/elements_view.h> -#include <__ranges/empty.h> -#include <__ranges/empty_view.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/enable_view.h> -#include <__ranges/filter_view.h> -#include <__ranges/from_range.h> -#include <__ranges/iota_view.h> -#include <__ranges/join_view.h> -#include <__ranges/lazy_split_view.h> -#include <__ranges/rbegin.h> -#include <__ranges/ref_view.h> -#include <__ranges/rend.h> -#include <__ranges/repeat_view.h> -#include <__ranges/reverse_view.h> -#include <__ranges/single_view.h> -#include <__ranges/size.h> -#include <__ranges/split_view.h> -#include <__ranges/subrange.h> -#include <__ranges/take_view.h> -#include <__ranges/take_while_view.h> -#include <__ranges/to.h> -#include <__ranges/transform_view.h> -#include <__ranges/view_interface.h> -#include <__ranges/views.h> -#include <__ranges/zip_view.h> -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include <__ranges/istream_view.h> +#if _LIBCPP_STD_VER >= 20 +# include <__ranges/access.h> +# include <__ranges/all.h> +# include <__ranges/common_view.h> +# include <__ranges/concepts.h> +# include <__ranges/counted.h> +# include <__ranges/dangling.h> +# include <__ranges/data.h> +# include <__ranges/drop_view.h> +# include <__ranges/drop_while_view.h> +# include <__ranges/elements_view.h> +# include <__ranges/empty.h> +# include <__ranges/empty_view.h> +# include <__ranges/enable_borrowed_range.h> +# include <__ranges/enable_view.h> +# include <__ranges/filter_view.h> +# include <__ranges/iota_view.h> +# include <__ranges/join_view.h> +# include <__ranges/lazy_split_view.h> +# include <__ranges/rbegin.h> +# include <__ranges/ref_view.h> +# include <__ranges/rend.h> +# include <__ranges/reverse_view.h> +# include <__ranges/single_view.h> +# include <__ranges/size.h> +# include <__ranges/split_view.h> +# include <__ranges/subrange.h> +# include <__ranges/take_view.h> +# include <__ranges/take_while_view.h> +# include <__ranges/transform_view.h> +# include <__ranges/view_interface.h> +# include <__ranges/views.h> + +# if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include <__ranges/istream_view.h> +# endif 
+#endif + +#if _LIBCPP_STD_VER >= 23 +# include <__ranges/as_rvalue_view.h> +# include <__ranges/chunk_by_view.h> +# include <__ranges/from_range.h> +# include <__ranges/repeat_view.h> +# include <__ranges/to.h> +# include <__ranges/zip_view.h> #endif +#include + // standard-mandated includes // [ranges.syn] @@ -439,6 +446,14 @@ namespace std { # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 +# include +# include +# include +# include +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/stop_token b/libcxx/include/stop_token index c9c54dfb5a755..d4e651d9541f4 100644 --- a/libcxx/include/stop_token +++ b/libcxx/include/stop_token @@ -35,9 +35,12 @@ namespace std { #if !defined(_LIBCPP_HAS_NO_THREADS) -# include <__stop_token/stop_callback.h> -# include <__stop_token/stop_source.h> -# include <__stop_token/stop_token.h> +# if _LIBCPP_STD_VER >= 20 +# include <__stop_token/stop_callback.h> +# include <__stop_token/stop_source.h> +# include <__stop_token/stop_token.h> +# endif + # include # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/string_view b/libcxx/include/string_view index b2a4db4e7519a..72dbf0bfa8e54 100644 --- a/libcxx/include/string_view +++ b/libcxx/include/string_view @@ -210,6 +210,7 @@ namespace std { #include <__config> #include <__functional/hash.h> #include <__functional/unary_function.h> +#include <__fwd/ostream.h> #include <__fwd/string_view.h> #include <__iterator/bounded_iter.h> #include <__iterator/concepts.h> diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index a77ddadafb681..ffa137338b6a2 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -428,97 +428,96 @@ namespace std #include <__type_traits/aligned_storage.h> #include <__type_traits/aligned_union.h> #include <__type_traits/alignment_of.h> -#include <__type_traits/can_extract_key.h> -#include <__type_traits/common_reference.h> #include <__type_traits/common_type.h> #include <__type_traits/conditional.h> -#include <__type_traits/conjunction.h> #include <__type_traits/decay.h> -#include <__type_traits/dependent_type.h> -#include <__type_traits/disjunction.h> #include <__type_traits/enable_if.h> #include <__type_traits/extent.h> -#include <__type_traits/has_unique_object_representation.h> #include <__type_traits/has_virtual_destructor.h> #include <__type_traits/integral_constant.h> -#include <__type_traits/invoke.h> #include <__type_traits/is_abstract.h> -#include <__type_traits/is_aggregate.h> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_array.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_base_of.h> -#include <__type_traits/is_bounded_array.h> -#include <__type_traits/is_callable.h> -#include <__type_traits/is_char_like_type.h> #include <__type_traits/is_class.h> #include <__type_traits/is_compound.h> #include <__type_traits/is_const.h> -#include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_destructible.h> #include <__type_traits/is_empty.h> #include <__type_traits/is_enum.h> -#include <__type_traits/is_final.h> #include <__type_traits/is_floating_point.h> #include <__type_traits/is_function.h> #include <__type_traits/is_fundamental.h> -#include <__type_traits/is_implicitly_default_constructible.h> #include <__type_traits/is_integral.h> 
#include <__type_traits/is_literal_type.h> #include <__type_traits/is_member_pointer.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_nothrow_convertible.h> #include <__type_traits/is_nothrow_destructible.h> -#include <__type_traits/is_null_pointer.h> #include <__type_traits/is_object.h> #include <__type_traits/is_pod.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_polymorphic.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_reference_wrapper.h> -#include <__type_traits/is_referenceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_scalar.h> -#include <__type_traits/is_scoped_enum.h> #include <__type_traits/is_signed.h> -#include <__type_traits/is_specialization.h> #include <__type_traits/is_standard_layout.h> -#include <__type_traits/is_swappable.h> #include <__type_traits/is_trivial.h> #include <__type_traits/is_trivially_assignable.h> #include <__type_traits/is_trivially_constructible.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_unbounded_array.h> #include <__type_traits/is_union.h> #include <__type_traits/is_unsigned.h> #include <__type_traits/is_void.h> #include <__type_traits/is_volatile.h> -#include <__type_traits/make_const_lvalue_ref.h> #include <__type_traits/make_signed.h> #include <__type_traits/make_unsigned.h> -#include <__type_traits/maybe_const.h> -#include <__type_traits/negation.h> #include <__type_traits/rank.h> #include <__type_traits/remove_all_extents.h> #include <__type_traits/remove_const.h> -#include <__type_traits/remove_const_ref.h> #include <__type_traits/remove_cv.h> #include <__type_traits/remove_extent.h> #include <__type_traits/remove_pointer.h> #include <__type_traits/remove_reference.h> #include <__type_traits/remove_volatile.h> #include <__type_traits/result_of.h> -#include <__type_traits/type_identity.h> #include <__type_traits/underlying_type.h> -#include <__type_traits/unwrap_ref.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include -#include + +#if _LIBCPP_STD_VER >= 14 +# include <__type_traits/is_final.h> +# include <__type_traits/is_null_pointer.h> +#endif + +#if _LIBCPP_STD_VER >= 17 +# include <__type_traits/conjunction.h> +# include <__type_traits/disjunction.h> +# include <__type_traits/has_unique_object_representation.h> +# include <__type_traits/invoke.h> +# include <__type_traits/is_aggregate.h> +# include <__type_traits/is_swappable.h> +# include <__type_traits/negation.h> +# include <__type_traits/void_t.h> +#endif + +#if _LIBCPP_STD_VER >= 20 +# include <__type_traits/common_reference.h> +# include <__type_traits/is_bounded_array.h> +# include <__type_traits/is_constant_evaluated.h> +# include <__type_traits/is_nothrow_convertible.h> +# include <__type_traits/is_unbounded_array.h> +# include <__type_traits/type_identity.h> +# include <__type_traits/unwrap_ref.h> +#endif + +#if _LIBCPP_STD_VER >= 23 +# include <__type_traits/is_scoped_enum.h> +#endif + #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/utility b/libcxx/include/utility index f2f0052df2755..f97907fbf72e9 100644 --- a/libcxx/include/utility +++ b/libcxx/include/utility @@ -247,25 +247,35 @@ template */ #include <__config> -#include <__utility/as_const.h> -#include <__utility/as_lvalue.h> -#include <__utility/auto_cast.h> -#include <__utility/cmp.h> + #include <__utility/declval.h> 
-#include <__utility/exception_guard.h> -#include <__utility/exchange.h> #include <__utility/forward.h> -#include <__utility/forward_like.h> -#include <__utility/in_place.h> -#include <__utility/integer_sequence.h> #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/piecewise_construct.h> -#include <__utility/priority_tag.h> #include <__utility/rel_ops.h> #include <__utility/swap.h> -#include <__utility/to_underlying.h> -#include <__utility/unreachable.h> + +#if _LIBCPP_STD_VER >= 14 +# include <__utility/exchange.h> +# include <__utility/integer_sequence.h> +#endif + +#if _LIBCPP_STD_VER >= 17 +# include <__utility/as_const.h> +# include <__utility/in_place.h> +#endif + +#if _LIBCPP_STD_VER >= 20 +# include <__utility/cmp.h> +#endif + +#if _LIBCPP_STD_VER >= 23 +# include <__utility/forward_like.h> +# include <__utility/to_underlying.h> +# include <__utility/unreachable.h> +#endif + #include // standard-mandated includes @@ -286,6 +296,10 @@ template # pragma GCC system_header #endif +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +#endif + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp index 9910befb0349e..7d2dd218f967b 100644 --- a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include "min_allocator.h" diff --git a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp index 8ace45d661df6..a16ae1d527921 100644 --- a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include "min_allocator.h" diff --git a/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp index feaef53ae09d5..45d0cc3b95f90 100644 --- a/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp +++ b/libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp @@ -11,8 +11,8 @@ // // Arithmetic operators +#include <__iterator/bounded_iter.h> #include -#include #include "test_iterators.h" #include "test_macros.h" diff --git a/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp index f4b0da9511eaf..9c5df5da55b9c 100644 --- a/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp +++ b/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp @@ -11,7 +11,7 @@ // // Comparison operators -#include +#include <__iterator/bounded_iter.h> #include "test_iterators.h" #include "test_macros.h" diff --git a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp index bf723f14e80a9..7e3a59a49ffd4 100644 --- a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp +++ b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp @@ -16,7 +16,7 @@ // UNSUPPORTED: libcpp-hardening-mode=none // XFAIL: libcpp-hardening-mode=debug && 
availability-verbose_abort-missing -#include +#include <__iterator/bounded_iter.h> #include "check_assertion.h" #include "test_iterators.h" diff --git a/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp index 6ae0928b7e528..bfd779d644f51 100644 --- a/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp +++ b/libcxx/test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp @@ -11,9 +11,10 @@ // // std::pointer_traits specialization +#include <__iterator/bounded_iter.h> #include #include -#include +#include #include #include "test_iterators.h" diff --git a/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp index db95513055f81..56ded9ae5ed21 100644 --- a/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp +++ b/libcxx/test/libcxx/iterators/bounded_iter/types.compile.pass.cpp @@ -11,6 +11,7 @@ // // Nested types +#include <__iterator/bounded_iter.h> #include #include #include diff --git a/libcxx/test/libcxx/memory/allocation_guard.pass.cpp b/libcxx/test/libcxx/memory/allocation_guard.pass.cpp index 4b2f7abe9159b..493ebf044187c 100644 --- a/libcxx/test/libcxx/memory/allocation_guard.pass.cpp +++ b/libcxx/test/libcxx/memory/allocation_guard.pass.cpp @@ -15,8 +15,8 @@ // template // struct __allocation_guard; +#include <__memory/allocation_guard.h> #include -#include #include #include diff --git a/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp b/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp index 8bc890a208d0c..4258089813e0d 100644 --- a/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp +++ b/libcxx/test/libcxx/memory/compressed_pair/compressed_pair.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +#include <__memory/compressed_pair.h> #include -#include #include #include "test_macros.h" diff --git a/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp b/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp index a826555d48dda..aed78f9cddf84 100644 --- a/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp +++ b/libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp @@ -12,10 +12,10 @@ // closest representable value for the specified integer type, or // numeric_limits::max()/min() if the value isn't representable. 
+#include <__random/clamp_to_integral.h> #include #include #include -#include // for __clamp_to_integral template void test() { diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp index 7046936b1b7a7..b60f172363350 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.lifetimebound.verify.cpp @@ -11,7 +11,7 @@ // template // constexpr T& as-lvalue(T&& t) { // exposition only -#include +#include <__utility/as_lvalue.h> void test() { // Check prvalue diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp index 721279fcd586b..8e47a507f2f8a 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.adaptor.helpers/as-lvalue.pass.cpp @@ -11,6 +11,7 @@ // template // constexpr T& as-lvalue(T&& t) { // exposition only +#include <__utility/as_lvalue.h> #include #include diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index 65c805cd86b76..51e659f52000b 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -960,6 +960,7 @@ unordered_set type_traits unordered_set version utility compare utility cstddef +utility cstdint utility cstdlib utility initializer_list utility iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index bf353b2dd4ce4..17e85e982729c 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -967,6 +967,7 @@ unordered_set type_traits unordered_set version utility compare utility cstddef +utility cstdint utility cstdlib utility initializer_list utility iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index fa6e44873fc12..8aed93da9e6cc 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -970,6 +970,7 @@ unordered_set type_traits unordered_set version utility compare utility cstddef +utility cstdint utility cstdlib utility initializer_list utility iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index e03f74f50b914..2c028462144ee 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -971,6 +971,7 @@ unordered_set type_traits unordered_set version utility compare utility cstddef +utility cstdint utility cstdlib utility initializer_list utility iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 37cd4e58e7fca..982c2013e3417 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -682,6 +682,7 @@ ranges optional ranges span ranges tuple ranges type_traits +ranges variant ranges version ratio climits ratio cstdint @@ -977,6 +978,7 @@ unordered_set type_traits unordered_set version utility compare utility cstddef +utility cstdint 
utility cstdlib utility initializer_list utility iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 098752e8699f7..8ffb71d8b566b 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -456,7 +456,6 @@ ranges cwchar ranges initializer_list ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 098752e8699f7..8ffb71d8b566b 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -456,7 +456,6 @@ ranges cwchar ranges initializer_list ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple diff --git a/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp index f7f76bbe9bef0..d7bd701aa706a 100644 --- a/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp +++ b/libcxx/test/libcxx/type_traits/is_callable.compile.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include +#include <__type_traits/is_callable.h> struct Functor { void operator()(); diff --git a/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp b/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp index 55398f8ad64ba..5a779c0e96e21 100644 --- a/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp +++ b/libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp @@ -14,7 +14,7 @@ // returns false when there's no constant evaluation support from the compiler. // as well as when called not in a constexpr context -#include +#include <__type_traits/is_constant_evaluated.h> #include #include "test_macros.h" diff --git a/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp b/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp index e1951cbc8446f..ff0ab6f68df67 100644 --- a/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp +++ b/libcxx/test/libcxx/type_traits/is_implicitly_default_constructible.pass.cpp @@ -12,8 +12,7 @@ // __is_implicitly_default_constructible -#include - +#include <__type_traits/is_implicitly_default_constructible.h> struct ExplicitlyDefaultConstructible1 { explicit ExplicitlyDefaultConstructible1() = default; diff --git a/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp index 134e5f5d186a6..73dfc773aa774 100644 --- a/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp +++ b/libcxx/test/libcxx/type_traits/is_specialization.compile.pass.cpp @@ -14,10 +14,7 @@ // Note instantiation for certain type combinations are ill-formed. These are // tested in is_specialization.verify.cpp. -#include - -#include -#include +#include <__type_traits/is_specialization.h> #include #include #include diff --git a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp index 2fd1176417538..a798647d56ee1 100644 --- a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp +++ b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp @@ -13,8 +13,7 @@ // // Tests the ill-formed instantiations. 
-#include - +#include <__type_traits/is_specialization.h> #include #include diff --git a/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp b/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp index 51ff161357605..669bcdb58d7ba 100644 --- a/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp +++ b/libcxx/test/libcxx/type_traits/lazy_metafunctions.pass.cpp @@ -14,6 +14,9 @@ // Test the libc++ lazy meta-programming helpers in +#include <__type_traits/conjunction.h> +#include <__type_traits/disjunction.h> +#include <__type_traits/negation.h> #include #include "test_macros.h" diff --git a/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp b/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp index 7f2bcbca11ca0..60cf4f7814597 100644 --- a/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp +++ b/libcxx/test/libcxx/utilities/exception_guard.no_exceptions.pass.cpp @@ -10,6 +10,7 @@ // ADDITIONAL_COMPILE_FLAGS: -fno-exceptions +#include <__utility/exception_guard.h> #include int main(int, char**) { diff --git a/libcxx/test/libcxx/utilities/exception_guard.pass.cpp b/libcxx/test/libcxx/utilities/exception_guard.pass.cpp index 71e60fc94542d..0728959c7277b 100644 --- a/libcxx/test/libcxx/utilities/exception_guard.pass.cpp +++ b/libcxx/test/libcxx/utilities/exception_guard.pass.cpp @@ -10,6 +10,7 @@ // UNSUPPORTED: no-exceptions +#include <__utility/exception_guard.h> #include #include #include diff --git a/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp b/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp index a39352e539135..7e597081cfaad 100644 --- a/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp +++ b/libcxx/test/libcxx/utilities/function.objects/func.bind.partial/compose.pass.cpp @@ -11,7 +11,7 @@ // template // constexpr unspecified __compose(F1&&, F2&&); -#include +#include <__functional/compose.h> #include #include #include diff --git a/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp b/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp index 4696e3f667830..5d0c5586d9b3d 100644 --- a/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp +++ b/libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp @@ -10,9 +10,12 @@ #include "test_macros.h" TEST_CLANG_DIAGNOSTIC_IGNORED("-Wprivate-header") +#include <__type_traits/conjunction.h> +#include <__type_traits/disjunction.h> #include <__type_traits/is_valid_expansion.h> -#include +#include <__type_traits/negation.h> #include +#include struct Bomb; template diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp index 8d9df94eee5a6..f69351209e4f1 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/make.heap/ranges_make_heap.pass.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "almost_satisfies_types.h" diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp index 2f13bdc76f793..9efe2513271ed 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp +++ 
b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/pop.heap/ranges_pop_heap.pass.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "almost_satisfies_types.h" diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp index 6f8a1a20c965c..571da879ed3f5 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.heap.operations/push.heap/ranges_push_heap.pass.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "almost_satisfies_types.h" diff --git a/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp index 155f88010309c..2dddd84d8796e 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.all/range.owning.view/begin_end.pass.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "test_iterators.h" #include "test_macros.h" diff --git a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp index a4696d73839b6..549ebffe5c97e 100644 --- a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp +++ b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/plus_eq.pass.cpp @@ -10,9 +10,10 @@ // constexpr iterator& operator+=(difference_type n); -#include #include #include +#include +#include constexpr bool test() { std::ranges::repeat_view v(10); diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index aa819ecd4733b..bb3ba2182f557 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include From c6144cb0de35013e19ddd4d9fbc86367bb1ba223 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Thu, 18 Jul 2024 16:40:00 +0800 Subject: [PATCH 403/777] [ValueTracking] Remove unnecessary `m_ElementWiseBitCast` from `isKnownNonZeroFromOperator`; NFC --- llvm/lib/Analysis/ValueTracking.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 6e039ad2deadb..535a248a5f1a2 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -2784,11 +2784,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // This all implies the 2 i16 elements are non-zero. 
Type *FromTy = I->getOperand(0)->getType(); if ((FromTy->isIntOrIntVectorTy() || FromTy->isPtrOrPtrVectorTy()) && - (BitWidth % getBitWidth(FromTy->getScalarType(), Q.DL)) == 0) { - if (match(I, m_ElementWiseBitCast(m_Value()))) - return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); + (BitWidth % getBitWidth(FromTy->getScalarType(), Q.DL)) == 0) return isKnownNonZero(I->getOperand(0), Q, Depth); - } } break; case Instruction::IntToPtr: // Note that we have to take special care to avoid looking through From d097f430a172a5d798a39b416b1af84f4ec572e1 Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Thu, 18 Jul 2024 13:04:49 +0400 Subject: [PATCH 404/777] [lldb] Fixed the error `unable to launch a GDB server` in API tests (#98833) TestPlatformLaunchGDBServer.py runs `ldb-server` w/o parameters `--min-gdbserver-port`, `--max-gdbserver-port` or `--gdbserver-port`. So `gdbserver_portmap` is empty and `gdbserver_portmap.GetNextAvailablePort()` will return 0. Do not call `portmap_for_child.AllowPort(0)` in this case. Otherwise `portmap_for_child.GetNextAvailablePort()` will allocate and never free the port 0 and next call `portmap_for_child.GetNextAvailablePort()` will fail. Added few asserts in `GDBRemoteCommunicationServerPlatform::PortMap` to avoid such issue in the future. This patch fixes a bug added in #88845. The behaviour is very close to #97537 w/o parameters `--min-gdbserver-port`, `--max-gdbserver-port` and `--gdbserver-port`. --- .../GDBRemoteCommunicationServerPlatform.cpp | 2 ++ lldb/tools/lldb-server/lldb-platform.cpp | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp index 5285ec1d3db4e..65f1cc12ba307 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp @@ -46,11 +46,13 @@ using namespace lldb_private; GDBRemoteCommunicationServerPlatform::PortMap::PortMap(uint16_t min_port, uint16_t max_port) { + assert(min_port); for (; min_port < max_port; ++min_port) m_port_map[min_port] = LLDB_INVALID_PROCESS_ID; } void GDBRemoteCommunicationServerPlatform::PortMap::AllowPort(uint16_t port) { + assert(port); // Do not modify existing mappings m_port_map.insert({port, LLDB_INVALID_PROCESS_ID}); } diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp index cfd0a3797d810..7148a1d2a3094 100644 --- a/lldb/tools/lldb-server/lldb-platform.cpp +++ b/lldb/tools/lldb-server/lldb-platform.cpp @@ -313,9 +313,11 @@ int main_platform(int argc, char *argv[]) { GDBRemoteCommunicationServerPlatform::PortMap portmap_for_child; llvm::Expected available_port = gdbserver_portmap.GetNextAvailablePort(); - if (available_port) - portmap_for_child.AllowPort(*available_port); - else { + if (available_port) { + // GetNextAvailablePort() may return 0 if gdbserver_portmap is empty. 
+ if (*available_port) + portmap_for_child.AllowPort(*available_port); + } else { llvm::consumeError(available_port.takeError()); fprintf(stderr, "no available gdbserver port for connection - dropping...\n"); @@ -352,7 +354,7 @@ int main_platform(int argc, char *argv[]) { if (platform.IsConnected()) { if (inferior_arguments.GetArgumentCount() > 0) { lldb::pid_t pid = LLDB_INVALID_PROCESS_ID; - std::optional port = 0; + std::optional port; std::string socket_name; Status error = platform.LaunchGDBServer(inferior_arguments, "", // hostname From ba8e4920ca57518f429bcf0a68ed3d48195fb1e6 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Thu, 18 Jul 2024 14:51:09 +0530 Subject: [PATCH 405/777] [MLIR] NFC. Remove anti-patterns given the default null init for Value (#99457) Remove anti-patterns given the default null init for Value. Drop some extra includes while on this file. NFC. Co-authored-by: GitHub runner --- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 605737542e9fc..f09d93f3ba444 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -12,10 +12,8 @@ #include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -2082,8 +2080,8 @@ static LogicalResult generateCopy( auto numElementsSSA = top.create(loc, *numElements); - Value dmaStride = nullptr; - Value numEltPerDmaStride = nullptr; + Value dmaStride; + Value numEltPerDmaStride; if (copyOptions.generateDma) { SmallVector dmaStrideInfos; getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos); From e4a2d74e0917d481ecda8e8ff0c0af3c683c9441 Mon Sep 17 00:00:00 2001 From: Him188 Date: Thu, 18 Jul 2024 10:22:27 +0100 Subject: [PATCH 406/777] [AArch64][GISel] Always fold G_SHL into addressing mode where possible, unless the subtarget has addr-lsl-slow-14 (#96603) Before this patch, we fold G_SHL into addressing mode lsl only when there is exactly one usage, or all the usages are memory ops, or we are optimizing for size. However, lsl is free on all aarch64 targets except those with FeatureAddrLSLSlow14. This patch uses this fact and always folds G_SHL into lsl for memory ops, with exceptions for FeatureAddrLSLSlow14. This patch also fixes GISel 15% regression in TSVC kernel s482, and brings regression in s291 from 20% to 10%. 
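As a rough illustration of what folding the shift into the addressing mode means (the registers and the 8-byte element size below are made up for the example and are not taken from the patch), consider loading a[i] with the base address in x1 and the index i in x2, so the index has to be scaled by 8, i.e. lsl #3:

  // Without folding, the scaled index is materialised by a separate shift:
  lsl  x8, x2, #3
  ldr  x0, [x1, x8]

  // With folding, the shift becomes part of the load's addressing mode. This
  // is free on all AArch64 subtargets except those with FeatureAddrLSLSlow14,
  // which is slow for shift amounts 1 and 4, hence the exception above.
  ldr  x0, [x1, x2, lsl #3]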
--- .../GISel/AArch64InstructionSelector.cpp | 76 +++++++-- .../GlobalISel/load-addressing-modes.mir | 156 ++++++++++++------ .../GlobalISel/store-addressing-modes.mir | 62 ++++--- .../CodeGen/AArch64/aarch64-fold-lslfast.ll | 49 +++--- 4 files changed, 231 insertions(+), 112 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 24d65624e09e9..0d3f6d9e353ba 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -414,8 +414,13 @@ class AArch64InstructionSelector : public InstructionSelector { return selectAddrModeIndexed(Root, Width / 8); } + std::optional + isWorthFoldingIntoAddrMode(MachineInstr &MI, + const MachineRegisterInfo &MRI) const; + bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, - const MachineRegisterInfo &MRI) const; + const MachineRegisterInfo &MRI, + bool IsAddrOperand) const; ComplexRendererFns selectAddrModeShiftedExtendXReg(MachineOperand &Root, unsigned SizeInBytes) const; @@ -6869,19 +6874,70 @@ AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { return select12BitValueWithLeftShift(Immed); } +/// Checks if we are sure that folding MI into load/store addressing mode is +/// beneficial or not. +/// +/// Returns: +/// - true if folding MI would be beneficial. +/// - false if folding MI would be bad. +/// - std::nullopt if it is not sure whether folding MI is beneficial. +/// +/// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example: +/// +/// %13:gpr(s64) = G_CONSTANT i64 1 +/// %8:gpr(s64) = G_SHL %6, %13(s64) +/// %9:gpr(p0) = G_PTR_ADD %0, %8(s64) +/// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) +std::optional AArch64InstructionSelector::isWorthFoldingIntoAddrMode( + MachineInstr &MI, const MachineRegisterInfo &MRI) const { + if (MI.getOpcode() == AArch64::G_SHL) { + // Address operands with shifts are free, except for running on subtargets + // with AddrLSLSlow14. + if (const auto ValAndVeg = getIConstantVRegValWithLookThrough( + MI.getOperand(2).getReg(), MRI)) { + const APInt ShiftVal = ValAndVeg->Value; + + // Don't fold if we know this will be slow. + return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4)); + } + } + return std::nullopt; +} + /// Return true if it is worth folding MI into an extended register. That is, /// if it's safe to pull it into the addressing mode of a load or store as a /// shift. +/// \p IsAddrOperand whether the def of MI is used as an address operand +/// (e.g. feeding into an LDR/STR). bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( - MachineInstr &MI, const MachineRegisterInfo &MRI) const { + MachineInstr &MI, const MachineRegisterInfo &MRI, + bool IsAddrOperand) const { + // Always fold if there is one use, or if we're optimizing for size. Register DefReg = MI.getOperand(0).getReg(); if (MRI.hasOneNonDBGUse(DefReg) || MI.getParent()->getParent()->getFunction().hasOptSize()) return true; - // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as - // appropriate. + if (IsAddrOperand) { + // If we are already sure that folding MI is good or bad, return the result. 
+ if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI)) + return *Worth; + + // Fold G_PTR_ADD if its offset operand can be folded + if (MI.getOpcode() == AArch64::G_PTR_ADD) { + MachineInstr *OffsetInst = + getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + + // Note, we already know G_PTR_ADD is used by at least two instructions. + // If we are also sure about whether folding is beneficial or not, + // return the result. + if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI)) + return *Worth; + } + } + + // FIXME: Consider checking HasALULSLFast as appropriate. // We have a fastpath, so folding a shift in and potentially computing it // many times may be beneficial. Check if this is only used in memory ops. @@ -6929,7 +6985,7 @@ AArch64InstructionSelector::selectExtendedSHL( int64_t LegalShiftVal = Log2_32(SizeInBytes); if (LegalShiftVal == 0) return std::nullopt; - if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true)) return std::nullopt; // Now, try to find the specific G_CONSTANT. Start by assuming that the @@ -7036,7 +7092,7 @@ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( // Check if we can find the G_PTR_ADD. MachineInstr *PtrAdd = getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); - if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true)) return std::nullopt; // Now, try to match an opcode which will match our specific offset. @@ -7170,7 +7226,7 @@ AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, MachineInstr *PtrAdd = getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); - if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true)) return std::nullopt; MachineOperand &LHS = PtrAdd->getOperand(1); @@ -7201,7 +7257,7 @@ AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, // // e.g. // ldr something, [base_reg, ext_reg, sxtw] - if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true)) return std::nullopt; // Check if this is an extend. We'll get an extend type if it is. @@ -7396,7 +7452,7 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root, return std::nullopt; if (ShType == AArch64_AM::ROR && !AllowROR) return std::nullopt; - if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) + if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false)) return std::nullopt; // Need an immediate on the RHS. @@ -7510,7 +7566,7 @@ AArch64InstructionSelector::selectArithExtendedRegister( if (!RootDef) return std::nullopt; - if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) + if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false)) return std::nullopt; // Check if we can fold a shift and an extend. 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir index 3af2aaf57eed8..dc2e1c5dc28d4 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -535,13 +535,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] - ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]] + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[UBFMXri]] + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 - ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] - ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]] + ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]] + ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]] ; CHECK-NEXT: $x2 = COPY [[ADDXrr2]] ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 @@ -571,19 +571,36 @@ body: | liveins: $x0, $x1, $x2 liveins: $w1, $x0 - ; CHECK-LABEL: name: ldrhrox_more_than_one_mem_use_shl - ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 - ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 - ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 - ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 - ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 - ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) - ; CHECK-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) - ; CHECK-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]] - ; CHECK-NEXT: RET_ReallyLR implicit [[ADDWrr]] + ; CHECK-FAST-LABEL: name: ldrhrox_more_than_one_mem_use_shl + ; CHECK-FAST: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-FAST-NEXT: {{ $}} + ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-FAST-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-FAST-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-FAST-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-FAST-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 + ; CHECK-FAST-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-FAST-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-FAST-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]] + ; CHECK-FAST-NEXT: RET_ReallyLR implicit [[ADDWrr]] + ; + ; CHECK-SLOW-LABEL: name: ldrhrox_more_than_one_mem_use_shl + ; CHECK-SLOW: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-SLOW-NEXT: {{ $}} + ; 
CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-SLOW-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-SLOW-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-SLOW-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[SUBREG_TO_REG]].sub_32 + ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]] + ; CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY [[COPY]] + ; CHECK-SLOW-NEXT: [[ADDXrx:%[0-9]+]]:gpr64sp = ADDXrx [[COPY4]], [[COPY3]], 1 + ; CHECK-SLOW-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16)) + ; CHECK-SLOW-NEXT: [[LDRHHui1:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16)) + ; CHECK-SLOW-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHui]], [[LDRHHui1]] + ; CHECK-SLOW-NEXT: RET_ReallyLR implicit [[ADDWrr]] %0:gpr(p0) = COPY $x0 %1:gpr(s32) = COPY $w1 %15:gpr(s64) = G_CONSTANT i64 9 @@ -612,19 +629,36 @@ body: | liveins: $x0, $x1, $x2 liveins: $w1, $x0 - ; CHECK-LABEL: name: ldrhrox_more_than_one_use_shl - ; CHECK: liveins: $x0, $x1, $x2, $w1, $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 - ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 - ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 - ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 - ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 - ; CHECK-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) - ; CHECK-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) - ; CHECK-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]] - ; CHECK-NEXT: RET_ReallyLR implicit [[ADDWrr]] + ; CHECK-FAST-LABEL: name: ldrhrox_more_than_one_use_shl + ; CHECK-FAST: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-FAST-NEXT: {{ $}} + ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-FAST-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-FAST-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-FAST-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-FAST-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[SUBREG_TO_REG]], 4103 + ; CHECK-FAST-NEXT: [[LDRHHroX:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-FAST-NEXT: [[LDRHHroX1:%[0-9]+]]:gpr32 = LDRHHroX [[COPY]], [[ANDXri]], 0, 1 :: (load (s16)) + ; CHECK-FAST-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHroX]], [[LDRHHroX1]] + ; CHECK-FAST-NEXT: RET_ReallyLR implicit [[ADDWrr]] + ; + ; CHECK-SLOW-LABEL: name: ldrhrox_more_than_one_use_shl + ; CHECK-SLOW: liveins: $x0, $x1, $x2, $w1, $x0 + ; CHECK-SLOW-NEXT: {{ $}} + ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-SLOW-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 9, 31 + ; CHECK-SLOW-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[UBFMWri]], 0 + ; CHECK-SLOW-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 + ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[SUBREG_TO_REG]].sub_32 + ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]] + ; CHECK-SLOW-NEXT: 
[[COPY4:%[0-9]+]]:gpr64common = COPY [[COPY]] + ; CHECK-SLOW-NEXT: [[ADDXrx:%[0-9]+]]:gpr64sp = ADDXrx [[COPY4]], [[COPY3]], 1 + ; CHECK-SLOW-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16)) + ; CHECK-SLOW-NEXT: [[LDRHHui1:%[0-9]+]]:gpr32 = LDRHHui [[ADDXrx]], 0 :: (load (s16)) + ; CHECK-SLOW-NEXT: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRHHui]], [[LDRHHui1]] + ; CHECK-SLOW-NEXT: RET_ReallyLR implicit [[ADDWrr]] %0:gpr(p0) = COPY $x0 %1:gpr(s32) = COPY $w1 %15:gpr(s64) = G_CONSTANT i64 9 @@ -656,15 +690,15 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 62, 61 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] - ; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[ADDXrr]], 0 :: (load (s32) from %ir.addr) - ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[LDRWui]], 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]] + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[UBFMXri]] + ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY1]], [[COPY]], 0, 1 :: (load (s32) from %ir.addr) + ; CHECK-NEXT: [[ORRWrs:%[0-9]+]]:gpr32 = ORRWrs $wzr, [[LDRWroX]], 0 ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrs]], %subreg.sub_32 ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 2, 0 ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[SUBREG_TO_REG]], [[ADDXri]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] - ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]] + ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]] ; CHECK-NEXT: $x2 = COPY [[ADDXrr2]] ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 @@ -692,21 +726,37 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: ldrqrox_more_than_one_use_shl - ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] - ; CHECK-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADDXrr]], 0 :: (load (s128) from %ir.addr) - ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[LDRQui]].dsub - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] - ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY3]], [[ADDXri]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] - ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXrr1]] - ; CHECK-NEXT: RET_ReallyLR implicit [[ADDXrr2]] + ; CHECK-FAST-LABEL: name: ldrqrox_more_than_one_use_shl + ; CHECK-FAST: liveins: $x0, $x1, $x2 + ; CHECK-FAST-NEXT: {{ $}} + ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-FAST-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59 + ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-FAST-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]] + ; CHECK-FAST-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[UBFMXri]] + ; CHECK-FAST-NEXT: [[LDRQroX:%[0-9]+]]:fpr128 = LDRQroX [[COPY1]], [[COPY]], 0, 1 :: (load (s128) from %ir.addr) + ; CHECK-FAST-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0 + ; 
CHECK-FAST-NEXT: [[COPY3:%[0-9]+]]:fpr64 = COPY [[LDRQroX]].dsub + ; CHECK-FAST-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[COPY3]] + ; CHECK-FAST-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXri]] + ; CHECK-FAST-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]] + ; CHECK-FAST-NEXT: RET_ReallyLR implicit [[ADDXrr2]] + ; + ; CHECK-SLOW-LABEL: name: ldrqrox_more_than_one_use_shl + ; CHECK-SLOW: liveins: $x0, $x1, $x2 + ; CHECK-SLOW-NEXT: {{ $}} + ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-SLOW-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 60, 59 + ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-SLOW-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] + ; CHECK-SLOW-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADDXrr]], 0 :: (load (s128) from %ir.addr) + ; CHECK-SLOW-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 4, 0 + ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[LDRQui]].dsub + ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY [[COPY2]] + ; CHECK-SLOW-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY3]], [[ADDXri]] + ; CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] + ; CHECK-SLOW-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[ADDXrr1]] + ; CHECK-SLOW-NEXT: RET_ReallyLR implicit [[ADDXrr2]] %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 4 %2:gpr(s64) = G_SHL %0, %1(s64) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir index 62ebe86504bfa..94af12a91ae97 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-addressing-modes.mir @@ -241,16 +241,28 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2 - ; CHECK-LABEL: name: shl_slow_1_more_than_one_use - ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 - ; CHECK-NEXT: STRHHroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 - ; CHECK-NEXT: STRHHroX [[COPY4]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) + ; CHECK-FAST-LABEL: name: shl_slow_1_more_than_one_use + ; CHECK-FAST: liveins: $x0, $x1, $x2 + ; CHECK-FAST-NEXT: {{ $}} + ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-FAST-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-FAST-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-FAST-NEXT: STRHHroX [[COPY3]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) + ; CHECK-FAST-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-FAST-NEXT: STRHHroX [[COPY4]], [[COPY1]], [[COPY]], 0, 1 :: (store (s16) into %ir.addr) + ; + ; CHECK-SLOW-LABEL: name: shl_slow_1_more_than_one_use + ; CHECK-SLOW: liveins: $x0, $x1, $x2 + ; CHECK-SLOW-NEXT: {{ $}} + ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-SLOW-NEXT: %ptr:gpr64common = ADDXrs [[COPY1]], [[COPY]], 1 + ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-SLOW-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-SLOW-NEXT: STRHHui [[COPY3]], %ptr, 0 :: (store (s16) into %ir.addr) + ; 
CHECK-SLOW-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY2]].sub_32 + ; CHECK-SLOW-NEXT: STRHHui [[COPY4]], %ptr, 0 :: (store (s16) into %ir.addr) %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 1 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -296,14 +308,24 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1, $x2, $q0 - ; CHECK-LABEL: name: shl_slow_4_more_than_one_use - ; CHECK: liveins: $x0, $x1, $x2, $q0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) - ; CHECK-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) + ; CHECK-FAST-LABEL: name: shl_slow_4_more_than_one_use + ; CHECK-FAST: liveins: $x0, $x1, $x2, $q0 + ; CHECK-FAST-NEXT: {{ $}} + ; CHECK-FAST-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-FAST-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-FAST-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-FAST-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) + ; CHECK-FAST-NEXT: STRQroX [[COPY2]], [[COPY1]], [[COPY]], 0, 1 :: (store (s128) into %ir.addr) + ; + ; CHECK-SLOW-LABEL: name: shl_slow_4_more_than_one_use + ; CHECK-SLOW: liveins: $x0, $x1, $x2, $q0 + ; CHECK-SLOW-NEXT: {{ $}} + ; CHECK-SLOW-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-SLOW-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-SLOW-NEXT: %ptr:gpr64common = ADDXrs [[COPY1]], [[COPY]], 4 + ; CHECK-SLOW-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-SLOW-NEXT: STRQui [[COPY2]], %ptr, 0 :: (store (s128) into %ir.addr) + ; CHECK-SLOW-NEXT: STRQui [[COPY2]], %ptr, 0 :: (store (s128) into %ir.addr) %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 4 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -339,7 +361,3 @@ body: | %4:gpr(p0) = COPY $x2 G_STORE %4, %ptr :: (store (p0) into %ir.addr) ... - -# NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -# CHECK-FAST: {{.*}} -# CHECK-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll index 614ac15d959f0..63dcafed2320a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -28,17 +28,16 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind { ; ; CHECK0-GISEL-LABEL: halfword: ; CHECK0-GISEL: // %bb.0: -; CHECK0-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill ; CHECK0-GISEL-NEXT: lsr w8, w1, #9 ; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK0-GISEL-NEXT: mov x19, x0 -; CHECK0-GISEL-NEXT: and x21, x8, #0xff -; CHECK0-GISEL-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK0-GISEL-NEXT: add x20, x0, w8, uxtb #1 +; CHECK0-GISEL-NEXT: ldrh w19, [x20] ; CHECK0-GISEL-NEXT: bl foo -; CHECK0-GISEL-NEXT: mov w0, w20 -; CHECK0-GISEL-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK0-GISEL-NEXT: mov w0, w19 +; CHECK0-GISEL-NEXT: strh w19, [x20] ; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK0-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload ; CHECK0-GISEL-NEXT: ret ; ; CHECK3-SDAG-LABEL: halfword: @@ -248,27 +247,23 @@ define i16 @multi_use_half_word(ptr %ctx, i32 %xor72) { ; ; CHECK0-GISEL-LABEL: multi_use_half_word: ; CHECK0-GISEL: // %bb.0: // %entry -; CHECK0-GISEL-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill -; CHECK0-GISEL-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK0-GISEL-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-GISEL-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-GISEL-NEXT: .cfi_def_cfa_offset 32 ; CHECK0-GISEL-NEXT: .cfi_offset w19, -8 ; CHECK0-GISEL-NEXT: .cfi_offset w20, -16 ; CHECK0-GISEL-NEXT: .cfi_offset w21, -24 -; CHECK0-GISEL-NEXT: .cfi_offset w22, -32 -; CHECK0-GISEL-NEXT: .cfi_offset w30, -48 +; CHECK0-GISEL-NEXT: .cfi_offset w30, -32 ; CHECK0-GISEL-NEXT: lsr w8, w1, #9 -; CHECK0-GISEL-NEXT: mov x19, x0 -; CHECK0-GISEL-NEXT: and x21, x8, #0xff -; CHECK0-GISEL-NEXT: ldrh w20, [x0, x21, lsl #1] -; CHECK0-GISEL-NEXT: add w22, w20, #1 +; CHECK0-GISEL-NEXT: add x20, x0, w8, uxtb #1 +; CHECK0-GISEL-NEXT: ldrh w19, [x20] +; CHECK0-GISEL-NEXT: add w21, w19, #1 ; CHECK0-GISEL-NEXT: bl foo -; CHECK0-GISEL-NEXT: strh w20, [x19, x21, lsl #1] -; CHECK0-GISEL-NEXT: mov w0, w20 -; CHECK0-GISEL-NEXT: strh w22, [x19, x21, lsl #1] -; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK0-GISEL-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK0-GISEL-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK0-GISEL-NEXT: strh w19, [x20] +; CHECK0-GISEL-NEXT: mov w0, w19 +; CHECK0-GISEL-NEXT: strh w21, [x20] +; CHECK0-GISEL-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-GISEL-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK0-GISEL-NEXT: ret ; ; CHECK3-SDAG-LABEL: multi_use_half_word: @@ -387,14 +382,14 @@ define i128 @gep4(ptr %p, i128 %a, i64 %b) { ; ; CHECK0-GISEL-LABEL: gep4: ; CHECK0-GISEL: // %bb.0: -; CHECK0-GISEL-NEXT: ldr q1, [x0, x4, lsl #4] +; CHECK0-GISEL-NEXT: add x8, x0, x4, lsl #4 ; CHECK0-GISEL-NEXT: mov v0.d[0], x2 -; CHECK0-GISEL-NEXT: mov x8, x0 +; CHECK0-GISEL-NEXT: ldr q1, [x8] ; CHECK0-GISEL-NEXT: mov d2, v1.d[1] -; CHECK0-GISEL-NEXT: fmov x0, d1 ; CHECK0-GISEL-NEXT: mov v0.d[1], x3 +; CHECK0-GISEL-NEXT: fmov x0, d1 ; CHECK0-GISEL-NEXT: fmov x1, d2 -; CHECK0-GISEL-NEXT: str q0, [x8, x4, lsl #4] +; CHECK0-GISEL-NEXT: str q0, [x8] ; CHECK0-GISEL-NEXT: ret ; ; CHECK3-SDAG-LABEL: gep4: From c661422db04d86b83c1bfeed18e31745a3725357 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 18 Jul 2024 11:21:47 +0200 Subject: [PATCH 407/777] [Clang] Handle OMPReverseDirectiveClass in switch --- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 + 1 file 
changed, 1 insertion(+) diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 19c85352a6144..e56f75be8ebc2 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1811,6 +1811,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OMPTargetTeamsDistributeParallelForDirectiveClass: case Stmt::OMPTargetTeamsDistributeParallelForSimdDirectiveClass: case Stmt::OMPTargetTeamsDistributeSimdDirectiveClass: + case Stmt::OMPReverseDirectiveClass: case Stmt::OMPTileDirectiveClass: case Stmt::OMPInteropDirectiveClass: case Stmt::OMPDispatchDirectiveClass: From 783e07f3a4f4684613ffb4a442c97f25c83f309b Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Mon, 15 Jul 2024 00:09:49 +0200 Subject: [PATCH 408/777] [InstCombine] Add test cases related to demanded use bits for ashr When trying to improve value tracking in https://github.com/llvm/llvm-project/pull/97693 some regressions were found due to a "weirdness" in simplify demanded use bits for ashr. Normally an ashr is replaced by lshr when the shifted in bits aren't demanded. Some years ago (see commit 22178dd33b346020) there was a test case motivating keeping the ashr when any sign bit (besides the shifted in bits) was demanded. The weird part about it is that the better we get at analysing known sign bits, the less likely it is that we canonicalize from ashr to lshr. That makes it hard to tune other combines to work based on the canonicalization, as well as possibly resulting in unexpected regressions when improving value tracking. This patch adds a test case for which it would be better to canonicalize ashr into lshr when possible. It is also worth mentioning that reverting 22178dd33b346020 doesn't seem to cause regressions in any other lit tests (not even the one added in 22178dd33b346020). --- .../Transforms/InstCombine/ashr-demand.ll | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/ashr-demand.ll b/llvm/test/Transforms/InstCombine/ashr-demand.ll index 2b5ccf0626dd7..be7496d38f272 100644 --- a/llvm/test/Transforms/InstCombine/ashr-demand.ll +++ b/llvm/test/Transforms/InstCombine/ashr-demand.ll @@ -52,3 +52,39 @@ define <2 x i32> @srem2_ashr_mask_vector_nonconstant(<2 x i32> %a0, <2 x i32> %a %mask = and <2 x i32> %ashr, ret <2 x i32> %mask } + + +; If it does not matter if we do ashr or lshr, then we canonicalize to lshr. + +define i16 @ashr_can_be_lshr(i32 %a) { +; CHECK-LABEL: @ashr_can_be_lshr( +; CHECK-NEXT: [[ASHR:%.*]] = lshr exact i32 [[A:%.*]], 16 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i32 [[ASHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %ashr = ashr exact i32 %a, 16 + %trunc = trunc nsw i32 %ashr to i16 + ret i16 %trunc +} + +; Historically SimplifyDemandedUseBits skipped replacing ashr with lshr here +; due to known sign bits analysis indicating that %ashr had more than 33 sign +; bits. It does however seem weird not to always canonicalize to lshr when +; possible, and in this case rewriting into lshr would trigger further +; optimizations.
+define i32 @ashr_can_be_lshr_2(i32 %a) { +; CHECK-LABEL: @ashr_can_be_lshr_2( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[A:%.*]], 1056964608 +; CHECK-NEXT: [[OR:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[OR]], 34 +; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i64 [[SHL]], 32 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i64 [[ASHR]] to i32 +; CHECK-NEXT: ret i32 [[TRUNC]] +; + %ext = zext i32 %a to i64 + %or = or i64 %ext, 4278190080 + %shl = shl i64 %or, 34 + %ashr = ashr exact i64 %shl, 32 + %trunc = trunc nsw i64 %ashr to i32 + ret i32 %trunc +} From b8c4c58ecf186dd91f40bdff4d1bdad403435789 Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Mon, 8 Jul 2024 01:35:53 +0200 Subject: [PATCH 409/777] [InstCombine] Turn AShr into LShr more often in SimplifyDemandedUseBits (#99155) The functional change here is to undo "llvm-svn: 311773", aka D36936, aka commit 22178dd33b3460207b8. That patch avoided converting AShr into LShr in SimplifyDemandedUseBits based on known sign bits analysis. Even if it would be legal to turn the shift into a logical shift (given that the shifted-in bits weren't demanded), that patch prevented converting the shift into LShr when any of the original sign bits were demanded. One side effect of the reverted functionality was that the better we were at computing the number of sign bits, the less likely it was that we would replace AShr by LShr during SimplifyDemandedUseBits. This was seen in https://github.com/llvm/llvm-project/pull/97693/ when an improvement of ComputeNumSignBits resulted in regressions due to no longer rewriting AShr to LShr. The test case from D36936 still passes after this commit. So it seems like at least the compiler has been taught how to optimize that scenario even if we do the AShr->LShr transform more aggressively. --- .../InstCombineSimplifyDemanded.cpp | 26 ++++++++----------- .../Transforms/InstCombine/ashr-demand.ll | 9 +++---- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 081e783c964fd..8a6ec3076ac62 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -806,34 +806,30 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, // Signed shift right. APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); - // If any of the high bits are demanded, we should set the sign bit as - // demanded. - if (DemandedMask.countl_zero() <= ShiftAmt) + // If any of the bits being shifted in are demanded, then we should set + // the sign bit as demanded. + bool ShiftedInBitsDemanded = DemandedMask.countl_zero() < ShiftAmt; + if (ShiftedInBitsDemanded) DemandedMaskIn.setSignBit(); - if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1, Q)) { // exact flag may not longer hold. I->dropPoisonGeneratingFlags(); return I; } - Known = KnownBits::ashr( - Known, KnownBits::makeConstant(APInt(BitWidth, ShiftAmt)), - ShiftAmt != 0, I->isExact()); - - // If the input sign bit is known to be zero, or if none of the top bits - // are demanded, turn this into an unsigned shift right.
- assert(BitWidth > ShiftAmt && "Shift amount not saturated?"); - APInt HighBits(APInt::getHighBitsSet( - BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth))); - if (Known.Zero[BitWidth-ShiftAmt-1] || - !DemandedMask.intersects(HighBits)) { + // If the input sign bit is known to be zero, or if none of the shifted in + // bits are demanded, turn this into an unsigned shift right. + if (Known.Zero[BitWidth - 1] || !ShiftedInBitsDemanded) { BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0), I->getOperand(1)); LShr->setIsExact(cast(I)->isExact()); LShr->takeName(I); return InsertNewInstWith(LShr, I->getIterator()); } + + Known = KnownBits::ashr( + Known, KnownBits::makeConstant(APInt(BitWidth, ShiftAmt)), + ShiftAmt != 0, I->isExact()); } else { llvm::computeKnownBits(I, Known, Depth, Q); } diff --git a/llvm/test/Transforms/InstCombine/ashr-demand.ll b/llvm/test/Transforms/InstCombine/ashr-demand.ll index be7496d38f272..a0e2af93b809b 100644 --- a/llvm/test/Transforms/InstCombine/ashr-demand.ll +++ b/llvm/test/Transforms/InstCombine/ashr-demand.ll @@ -74,12 +74,9 @@ define i16 @ashr_can_be_lshr(i32 %a) { ; optimizations. define i32 @ashr_can_be_lshr_2(i32 %a) { ; CHECK-LABEL: @ashr_can_be_lshr_2( -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[A:%.*]], 1056964608 -; CHECK-NEXT: [[OR:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[OR]], 34 -; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i64 [[SHL]], 32 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i64 [[ASHR]] to i32 -; CHECK-NEXT: ret i32 [[TRUNC]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[A:%.*]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], -67108864 +; CHECK-NEXT: ret i32 [[TMP2]] ; %ext = zext i32 %a to i64 %or = or i64 %ext, 4278190080 From c68c289984d161d220b9434be5dbbfc387981e23 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Thu, 18 Jul 2024 10:48:39 +0100 Subject: [PATCH 410/777] [Flang][OpenMP] Add support for proc_bind=primary (#99319) The support was missing only in the parser, all other phases handle the primary option for proc_bind. Fixes one of the issues in parsing for gomp/affinity-1.f90. (https://discourse.llvm.org/t/proposal-rename-flang-new-to-flang/69462/60) --- flang/lib/Parser/openmp-parsers.cpp | 3 ++- flang/test/Parser/OpenMP/proc-bind.f90 | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 flang/test/Parser/OpenMP/proc-bind.f90 diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 0ea48ce29ca2f..52789d6e5f0f6 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -31,10 +31,11 @@ TYPE_PARSER(construct( "SHARED" >> pure(OmpDefaultClause::Type::Shared) || "NONE" >> pure(OmpDefaultClause::Type::None))) -// 2.5 PROC_BIND (MASTER | CLOSE | SPREAD) +// 2.5 PROC_BIND (MASTER | CLOSE | PRIMARY | SPREAD ) TYPE_PARSER(construct( "CLOSE" >> pure(OmpProcBindClause::Type::Close) || "MASTER" >> pure(OmpProcBindClause::Type::Master) || + "PRIMARY" >> pure(OmpProcBindClause::Type::Primary) || "SPREAD" >> pure(OmpProcBindClause::Type::Spread))) // 2.15.5.1 MAP ([ [ALWAYS[,]] map-type : ] variable-name-list) diff --git a/flang/test/Parser/OpenMP/proc-bind.f90 b/flang/test/Parser/OpenMP/proc-bind.f90 new file mode 100644 index 0000000000000..08bcf69e5e765 --- /dev/null +++ b/flang/test/Parser/OpenMP/proc-bind.f90 @@ -0,0 +1,14 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s +! 
RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s + +! CHECK: !$OMP PARALLEL PROC_BIND(PRIMARY) + +! PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct +! PARSE-TREE: OmpBeginBlockDirective +! PARSE-TREE: OmpBlockDirective -> llvm::omp::Directive = parallel +! PARSE-TREE: OmpClauseList -> OmpClause -> ProcBind -> OmpProcBindClause -> Type = Primary +subroutine sb1 + !$omp parallel proc_bind(primary) + print *, "Hello" + !$omp end parallel +end subroutine From da0c8b275564f814a53a5c19497669ae2d99538d Mon Sep 17 00:00:00 2001 From: Hau Hsu Date: Thu, 18 Jul 2024 18:02:50 +0800 Subject: [PATCH 411/777] [RISCV][sanitizer] Fix sanitizer support for different virtual memory layout (#66743) This PR combines the following reviews from Phabricator: * https://reviews.llvm.org/D139823 * https://reviews.llvm.org/D139827 Other related (and merged) reviews are: * https://reviews.llvm.org/D152895 * https://reviews.llvm.org/D152991 * https://reviews.llvm.org/D152990 --------- Co-authored-by: Kito Cheng --- compiler-rt/lib/asan/asan_mapping.h | 7 +++++-- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 6 +++--- compiler-rt/lib/sanitizer_common/sanitizer_platform.h | 6 +++--- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/asan/asan_mapping.h b/compiler-rt/lib/asan/asan_mapping.h index c5f95c07a2105..91fe60db6329a 100644 --- a/compiler-rt/lib/asan/asan_mapping.h +++ b/compiler-rt/lib/asan/asan_mapping.h @@ -72,7 +72,10 @@ // || `[0x2000000000, 0x23ffffffff]` || LowShadow || // || `[0x0000000000, 0x1fffffffff]` || LowMem || // -// Default Linux/RISCV64 Sv39 mapping: +// Default Linux/RISCV64 Sv39 mapping with SHADOW_OFFSET == 0xd55550000; +// (the exact location of SHADOW_OFFSET may vary depending the dynamic probing +// by FindDynamicShadowStart). +// // || `[0x1555550000, 0x3fffffffff]` || HighMem || // || `[0x0fffffa000, 0x1555555fff]` || HighShadow || // || `[0x0effffa000, 0x0fffff9fff]` || ShadowGap || @@ -186,7 +189,7 @@ # elif SANITIZER_FREEBSD && defined(__aarch64__) # define ASAN_SHADOW_OFFSET_CONST 0x0000800000000000 # elif SANITIZER_RISCV64 -# define ASAN_SHADOW_OFFSET_CONST 0x0000000d55550000 +# define ASAN_SHADOW_OFFSET_DYNAMIC # elif defined(__aarch64__) # define ASAN_SHADOW_OFFSET_CONST 0x0000001000000000 # elif defined(__powerpc64__) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 7935c88204a05..794e3e7b2fb6c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1109,7 +1109,8 @@ uptr GetMaxVirtualAddress() { # if SANITIZER_NETBSD && defined(__x86_64__) return 0x7f7ffffff000ULL; // (0x00007f8000000000 - PAGE_SIZE) # elif SANITIZER_WORDSIZE == 64 -# if defined(__powerpc64__) || defined(__aarch64__) || defined(__loongarch__) +# if defined(__powerpc64__) || defined(__aarch64__) || \ + defined(__loongarch__) || SANITIZER_RISCV64 // On PowerPC64 we have two different address space layouts: 44- and 46-bit. // We somehow need to figure out which one we are using now and choose // one of 0x00000fffffffffffUL and 0x00003fffffffffffUL. @@ -1118,9 +1119,8 @@ uptr GetMaxVirtualAddress() { // This should (does) work for both PowerPC64 Endian modes. // Similarly, aarch64 has multiple address space layouts: 39, 42 and 47-bit. 
// loongarch64 also has multiple address space layouts: default is 47-bit. + // RISC-V 64 also has multiple address space layouts: 39, 48 and 57-bit. return (1ULL << (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1)) - 1; -# elif SANITIZER_RISCV64 - return (1ULL << 38) - 1; # elif SANITIZER_MIPS64 return (1ULL << 40) - 1; // 0x000000ffffffffffUL; # elif defined(__s390x__) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h index 5965281555059..57966403c92a9 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h @@ -295,8 +295,8 @@ // For such platforms build this code with -DSANITIZER_CAN_USE_ALLOCATOR64=0 or // change the definition of SANITIZER_CAN_USE_ALLOCATOR64 here. #ifndef SANITIZER_CAN_USE_ALLOCATOR64 -# if (SANITIZER_RISCV64 && !SANITIZER_FUCHSIA) || SANITIZER_IOS || \ - SANITIZER_DRIVERKIT +# if (SANITIZER_RISCV64 && !SANITIZER_FUCHSIA && !SANITIZER_LINUX) || \ + SANITIZER_IOS || SANITIZER_DRIVERKIT # define SANITIZER_CAN_USE_ALLOCATOR64 0 # elif defined(__mips64) || defined(__hexagon__) # define SANITIZER_CAN_USE_ALLOCATOR64 0 @@ -322,7 +322,7 @@ # if SANITIZER_FUCHSIA # define SANITIZER_MMAP_RANGE_SIZE (1ULL << 38) # else -# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 56) # endif #elif defined(__aarch64__) # if SANITIZER_APPLE diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index adf77f20cb1c7..149866a8e4200 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -108,7 +108,7 @@ static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; static const uint64_t kLoongArch64_ShadowOffset64 = 1ULL << 46; -static const uint64_t kRISCV64_ShadowOffset64 = 0xd55550000; +static const uint64_t kRISCV64_ShadowOffset64 = kDynamicShadowSentinel; static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30; static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46; static const uint64_t kFreeBSDAArch64_ShadowOffset64 = 1ULL << 47; From 4a19be5d45e4b1e02c2512023151be5d56ef5744 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Thu, 18 Jul 2024 13:26:37 +0300 Subject: [PATCH 412/777] [libc++][strings] P2591R5: Concatenation of strings and string views (#88389) Implemented: https://wg21.link/P2591R5 - https://eel.is/c++draft/string.syn - https://eel.is/c++draft/string.op.plus --------- Co-authored-by: Hristo Hristov --- libcxx/docs/FeatureTestMacroTable.rst | 2 + libcxx/docs/ReleaseNotes/19.rst | 1 + libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/string | 98 ++++++++ libcxx/include/version | 5 +- .../string.version.compile.pass.cpp | 5 +- .../string_view.version.compile.pass.cpp | 5 +- .../version.version.compile.pass.cpp | 5 +- .../string_op+/string.string_view.pass.cpp | 216 ++++++++++++++++++ .../generate_feature_test_macro_components.py | 2 +- 10 files changed, 332 insertions(+), 9 deletions(-) create mode 100644 libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 1e347d043ef69..53cfc3739d2be 100644 --- 
a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -474,6 +474,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_sstream_from_string_view`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_string_view`` ``202403L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_submdspan`` *unimplemented* ---------------------------------------------------------- ----------------- ``__cpp_lib_text_encoding`` *unimplemented* diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 80b9e18cec901..80f43256f1270 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -46,6 +46,7 @@ Implemented Papers - P2872R3 - Remove ``wstring_convert`` From C++26 - P3142R0 - Printing Blank Lines with ``println`` (as DR against C++23) - P2944R3 - Comparisons for ``reference_wrapper`` (comparison operators for ``reference_wrapper`` only) +- P2591R5 - Concatenation of strings and string views - P2968R2 - Make ``std::ignore`` a first-class object - P2302R4 - ``std::ranges::contains`` - P1659R3 - ``std::ranges::starts_with`` and ``std::ranges::ends_with`` diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 2c498f336b125..968d82a973a79 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -55,7 +55,7 @@ "`P2845R8 `__","LWG","Formatting of ``std::filesystem::path``","Tokyo March 2024","","","|format|" "`P0493R5 `__","LWG","Atomic minimum/maximum","Tokyo March 2024","","","" "`P2542R8 `__","LWG","``views::concat``","Tokyo March 2024","","","|ranges|" -"`P2591R5 `__","LWG","Concatenation of strings and string views","Tokyo March 2024","","","" +"`P2591R5 `__","LWG","Concatenation of strings and string views","Tokyo March 2024","|Complete|","19.0","" "`P2248R8 `__","LWG","Enabling list-initialization for algorithms","Tokyo March 2024","","","" "`P2810R4 `__","LWG","``is_debugger_present`` ``is_replaceable``","Tokyo March 2024","","","" "`P1068R11 `__","LWG","Vector API for random number generation","Tokyo March 2024","","","" diff --git a/libcxx/include/string b/libcxx/include/string index 9a52ab6aef41e..90394e9edbe83 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -407,6 +407,24 @@ template basic_string operator+(const basic_string& lhs, charT rhs); // constexpr since C++20 +template + constexpr basic_string + operator+(const basic_string& lhs, + type_identity_t> rhs); // Since C++26 +template + constexpr basic_string + operator+(basic_string&& lhs, + type_identity_t> rhs); // Since C++26 +template + constexpr basic_string + operator+(type_identity_t> lhs, + const basic_string& rhs); // Since C++26 +template + constexpr basic_string + operator+(type_identity_t> lhs, + basic_string&& rhs); // Since C++26 + + template bool operator==(const basic_string& lhs, const basic_string& rhs) noexcept; // constexpr since C++20 @@ -687,6 +705,28 @@ template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, _CharT __y); +#if _LIBCPP_STD_VER >= 26 + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + type_identity_t> __rhs); + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> 
+operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs); + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs); + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs); + +#endif + extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ , allocator >(char const*, string const&); @@ -2150,6 +2190,10 @@ private: friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(value_type, const basic_string&); friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const value_type*); friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, value_type); +#if _LIBCPP_STD_VER >= 26 + friend constexpr basic_string operator+ <>(const basic_string&, type_identity_t<__self_view>); + friend constexpr basic_string operator+ <>(type_identity_t<__self_view>, const basic_string&); +#endif }; // These declarations must appear before any functions are implicitly used @@ -4007,6 +4051,60 @@ operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, _CharT __rhs) { #endif // _LIBCPP_CXX03_LANG +#if _LIBCPP_STD_VER >= 26 + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + type_identity_t> __rhs) { + using _String = basic_string<_CharT, _Traits, _Allocator>; + typename _String::size_type __lhs_sz = __lhs.size(); + typename _String::size_type __rhs_sz = __rhs.size(); + _String __r(__uninitialized_size_tag(), + __lhs_sz + __rhs_sz, + _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + auto __ptr = std::__to_address(__r.__get_pointer()); + _Traits::copy(__ptr, __lhs.data(), __lhs_sz); + _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); + _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); + return __r; +} + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, + type_identity_t> __rhs) { + __lhs.append(__rhs); + return std::move(__lhs); +} + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + using _String = basic_string<_CharT, _Traits, _Allocator>; + typename _String::size_type __lhs_sz = __lhs.size(); + typename _String::size_type __rhs_sz = __rhs.size(); + _String __r(__uninitialized_size_tag(), + __lhs_sz + __rhs_sz, + _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); + auto __ptr = std::__to_address(__r.__get_pointer()); + _Traits::copy(__ptr, __lhs.data(), __lhs_sz); + _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); + _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); + return __r; +} + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, + basic_string<_CharT, _Traits, _Allocator>&& __rhs) { + __rhs.insert(0, __lhs); + return std::move(__rhs); +} + +#endif // _LIBCPP_STD_VER >= 26 + // swap template diff --git a/libcxx/include/version b/libcxx/include/version index c971336bcb85c..7d9fad1cb1eb2 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -231,7 +231,8 @@ 
__cpp_lib_stdatomic_h 202011L __cpp_lib_string_contains 202011L __cpp_lib_string_resize_and_overwrite 202110L __cpp_lib_string_udls 201304L -__cpp_lib_string_view 201803L +__cpp_lib_string_view 202403L + 201803L // C++20 201606L // C++17 __cpp_lib_submdspan 202306L __cpp_lib_syncbuf 201803L @@ -547,6 +548,8 @@ __cpp_lib_void_t 201411L # define __cpp_lib_span_at 202311L # define __cpp_lib_span_initializer_list 202311L # define __cpp_lib_sstream_from_string_view 202306L +# undef __cpp_lib_string_view +# define __cpp_lib_string_view 202403L // # define __cpp_lib_submdspan 202306L // # define __cpp_lib_text_encoding 202306L # undef __cpp_lib_to_chars diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp index af6386a40a458..69a938edd1cb9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp @@ -29,6 +29,7 @@ __cpp_lib_string_udls 201304L [C++14] __cpp_lib_string_view 201606L [C++17] 201803L [C++20] + 202403L [C++26] __cpp_lib_to_string 202306L [C++26] */ @@ -483,8 +484,8 @@ # ifndef __cpp_lib_string_view # error "__cpp_lib_string_view should be defined in c++26" # endif -# if __cpp_lib_string_view != 201803L -# error "__cpp_lib_string_view should have the value 201803L in c++26" +# if __cpp_lib_string_view != 202403L +# error "__cpp_lib_string_view should have the value 202403L in c++26" # endif # if !defined(_LIBCPP_VERSION) diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp index a86ab2adff6a9..f3c70cf977973 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp @@ -23,6 +23,7 @@ __cpp_lib_string_contains 202011L [C++23] __cpp_lib_string_view 201606L [C++17] 201803L [C++20] + 202403L [C++26] */ #include @@ -252,8 +253,8 @@ # ifndef __cpp_lib_string_view # error "__cpp_lib_string_view should be defined in c++26" # endif -# if __cpp_lib_string_view != 201803L -# error "__cpp_lib_string_view should have the value 201803L in c++26" +# if __cpp_lib_string_view != 202403L +# error "__cpp_lib_string_view should have the value 202403L in c++26" # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index a01ee702a5172..e1af3061725df 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -216,6 +216,7 @@ __cpp_lib_string_udls 201304L [C++14] __cpp_lib_string_view 201606L [C++17] 201803L [C++20] + 202403L [C++26] __cpp_lib_submdspan 202306L [C++26] __cpp_lib_syncbuf 201803L [C++20] __cpp_lib_text_encoding 202306L [C++26] @@ -7894,8 +7895,8 @@ # ifndef __cpp_lib_string_view # error "__cpp_lib_string_view should be defined in c++26" # endif -# if 
__cpp_lib_string_view != 201803L -# error "__cpp_lib_string_view should have the value 201803L in c++26" +# if __cpp_lib_string_view != 202403L +# error "__cpp_lib_string_view should have the value 202403L in c++26" # endif # if !defined(_LIBCPP_VERSION) diff --git a/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp b/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp new file mode 100644 index 0000000000000..3d981e8b3a3cd --- /dev/null +++ b/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/string.string_view.pass.cpp @@ -0,0 +1,216 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// [string.op.plus] +// +// template +// constexpr basic_string +// operator+(const basic_string& lhs, +// type_identity_t> rhs); // Since C++26 +// template +// constexpr basic_string +// operator+(basic_string&& lhs, +// type_identity_t> rhs); // Since C++26 +// template +// constexpr basic_string +// operator+(type_identity_t> lhs, +// const basic_string& rhs); // Since C++26 +// template +// constexpr basic_string +// operator+(type_identity_t> lhs, +// basic_string&& rhs); // Since C++26 + +#include +#include +#include +#include + +#include "asan_testing.h" +#include "constexpr_char_traits.h" +#include "make_string.h" +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_macros.h" + +template > +class ConvertibleToStringView { +public: + constexpr explicit ConvertibleToStringView(const CharT* cs) : cs_{cs} {} + + constexpr operator std::basic_string_view() { return std::basic_string_view(cs_); } + constexpr operator std::basic_string_view() const { + return std::basic_string_view(cs_); + } + +private: + const CharT* cs_; +}; + +static_assert(std::constructible_from, const ConvertibleToStringView>); +static_assert(std::convertible_to, std::basic_string_view>); + +static_assert(std::constructible_from, ConvertibleToStringView>); +static_assert(std::convertible_to, std::basic_string_view>); + +#define CS(S) MAKE_CSTRING(CharT, S) + +template