From bb3c2fc397652b97fd496530d41a495c4549b42b Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube@arm.com>
Date: Thu, 9 Jan 2025 14:04:39 +0000
Subject: [PATCH 1/6] [ARM][SLP] Fix incorrect cost function for SLP
 Vectorization of ZExt/SExt

PR #117350 made changes to the SLP vectorizer which introduced
a regression on ARM vectorization benchmarks. This was due
to the changes assuming that SExt/ZExt vector instructions have
constant cost. This behaviour is expected for RISCV but not on ARM
where we take into account source and destination type of SExt/ZExt
instructions when calculating vector cost.

Change-Id: I6f995dcde26e5aaf62b779b63e52988fb333f941
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  1 -
 .../Transforms/SLPVectorizer/ARM/vadd-mve.ll  | 29 +++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2a2a46a19d7c1..7e9373af0b7cb 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1794,7 +1794,6 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
   case ISD::ADD:
     if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
-
       // The legal cases are:
       //   VADDV u/s 8/16/32
       //   VADDLV u/s 32
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll b/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll
new file mode 100644
index 0000000000000..e0af0ca1e4f8b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=slp-vectorizer --mtriple arm-none-eabi -mattr=+mve -S -o - | FileCheck %s
+
+define i64 @vadd_32_64(ptr readonly %a) {
+; CHECK-LABEL: define i64 @vadd_32_64(
+; CHECK-SAME: ptr readonly [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[TMP0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+entry:
+  %0 = load i32, ptr %a, align 4
+  %conv = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds nuw i8, ptr %a, i32 4
+  %1 = load i32, ptr %arrayidx1, align 4
+  %conv2 = sext i32 %1 to i64
+  %add = add nsw i64 %conv2, %conv
+  %arrayidx3 = getelementptr inbounds nuw i8, ptr %a, i32 8
+  %2 = load i32, ptr %arrayidx3, align 4
+  %conv4 = sext i32 %2 to i64
+  %add5 = add nsw i64 %add, %conv4
+  %arrayidx6 = getelementptr inbounds nuw i8, ptr %a, i32 12
+  %3 = load i32, ptr %arrayidx6, align 4
+  %conv7 = sext i32 %3 to i64
+  %add8 = add nsw i64 %add5, %conv7
+  ret i64 %add8
+}

From 1921822b8d034a5f4617867e56a0f67e38fa5544 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube@arm.com>
Date: Thu, 30 Jan 2025 11:51:01 +0000
Subject: [PATCH 2/6] Adjust extended reduction cost

Change-Id: Ie2795e0e5ebb0589146eaf07c752410e307a36e6
---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 7e9373af0b7cb..b1152396fab0e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1794,6 +1794,7 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
   case ISD::ADD:
     if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
+
       // The legal cases are:
       //   VADDV u/s 8/16/32
       //   VADDLV u/s 32
@@ -1805,7 +1806,7 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
           ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
            (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
            (LT.second == MVT::v4i32 && RevVTSize <= 64)))
-        return ST->getMVEVectorCostFactor(CostKind) * LT.first;
+        return 3 * ST->getMVEVectorCostFactor(CostKind) * LT.first;
     }
     break;
   default:

From a602844898a8f5d022d73955aab73c2f65953c0d Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube@arm.com>
Date: Wed, 5 Feb 2025 15:57:57 +0000
Subject: [PATCH 3/6] [ARM]Reduce scalar cost of SMLAL muls

Changes to the SLPVectorizer caused a regression
in some benchmarks which targetted cores that support
both DSP and MVE instructions. The particular regression
has been reduced to a case where MUL instructions that
are part of a chain of instructions that can be replaced
with DSP SMLAL may also be vectorized. The generated
code ends up being an inefficient of both scalar
and vector ops rather than leaning to one or the other.
By reducing the cost of these MUL instructions in these
patterns we recover lost performance.

Change-Id: I302817cf4fcd18a11d40fba430c44e034a36448b
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 56 +++++++++++++++++--
 .../CostModel/ARM/muls-in-smlal-patterns.ll   | 37 ++++++++++++
 .../Transforms/SLPVectorizer/ARM/vadd-mve.ll  | 29 ----------
 3 files changed, 88 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index b1152396fab0e..8ece8bbf4e4a9 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1458,16 +1458,62 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
   if (LooksLikeAFreeShift())
     return 0;
 
+  // When targets have both DSP and MVE we find that the
+  // the compiler will attempt to vectorize as well as using
+  // scalar SMLAL operations. This is in cases where we have
+  // the pattern ext(mul(ext(i16), ext(i16))) we find
+  // that generated codegen performs better when only using SMLAL scalar
+  // ops instead of trying to mix vector ops with SMLAL ops. We therefore
+  // check if a mul instruction is used in a SMLAL pattern.
+  auto MulInSMLALPattern = [&](const Instruction *I, unsigned Opcode,
+                               Type *Ty) -> bool {
+    if (!ST->hasDSP() || !ST->hasMVEIntegerOps())
+      return false;
+    if (!I)
+      return false;
+
+    if (Opcode != Instruction::Mul)
+      return false;
+
+    if (Ty->isVectorTy())
+      return false;
+
+    auto IsSExtInst = [](const Value *V) -> bool {
+      return (dyn_cast<SExtInst>(V)) ? true : false;
+    };
+
+    // We check the arguments of the function to see if they're extends
+    auto *BinOp = dyn_cast<BinaryOperator>(I);
+    if (!BinOp)
+      return false;
+    auto *Op0 = BinOp->getOperand(0);
+    auto *Op1 = BinOp->getOperand(1);
+    if (Op0 && Op1 && IsSExtInst(Op0) && IsSExtInst(Op1)) {
+      // In this case we're interested in an ext of an i16
+      if (!Op0->getType()->isIntegerTy(32) || !Op1->getType()->isIntegerTy(32))
+        return false;
+      // We need to check if this result will be further extended to i64
+      for (auto *U : I->users())
+        if (IsSExtInst(dyn_cast<Value>(U)))
+          return true;
+    }
+
+    return false;
+  };
+
+  if (MulInSMLALPattern(CxtI, Opcode, Ty))
+    return 0;
+
   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
   // for "multiple beats" potentially needed by MVE instructions.
   int BaseCost = 1;
   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
     BaseCost = ST->getMVEVectorCostFactor(CostKind);
 
-  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
-  // without treating floats as more expensive that scalars or increasing the
-  // costs for custom operations. The results is also multiplied by the
-  // MVEVectorCostFactor where appropriate.
+  // The rest of this mostly follows what is done in
+  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
+  // that scalars or increasing the costs for custom operations. The results is
+  // also multiplied by the MVEVectorCostFactor where appropriate.
   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
     return LT.first * BaseCost;
 
@@ -1806,7 +1852,7 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
           ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
            (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
            (LT.second == MVT::v4i32 && RevVTSize <= 64)))
-        return 3 * ST->getMVEVectorCostFactor(CostKind) * LT.first;
+        return ST->getMVEVectorCostFactor(CostKind) * LT.first;
     }
     break;
   default:
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
new file mode 100644
index 0000000000000..eafd378ab0aad
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -0,0 +1,37 @@
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+mve,+dsp  < %s | FileCheck %s
+define i64 @test(i16 %a, i16 %b) {
+; CHECK-LABEL: 'test'
+; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+;
+    %as = sext i16 %a to i32
+    %bs = sext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = sext i32 %m to i64
+    ret i64 %ms
+}
+
+define i64 @withadd(i16 %a, i16 %b, i64 %c) {
+; CHECK-LABEL: 'withadd'
+; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+;
+    %as = sext i16 %a to i32
+    %bs = sext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = sext i32 %m to i64
+    %r = add i64 %c, %ms
+    ret i64 %r
+}
+
+define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
+; CHECK-LABEL: 'withloads'
+; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+;
+    %a = load i16, ptr %pa
+    %b = load i16, ptr %pb
+    %as = sext i16 %a to i32
+    %bs = sext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = sext i32 %m to i64
+    %r = add i64 %c, %ms
+    ret i64 %r
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll b/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll
deleted file mode 100644
index e0af0ca1e4f8b..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=slp-vectorizer --mtriple arm-none-eabi -mattr=+mve -S -o - | FileCheck %s
-
-define i64 @vadd_32_64(ptr readonly %a) {
-; CHECK-LABEL: define i64 @vadd_32_64(
-; CHECK-SAME: ptr readonly [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[TMP0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
-; CHECK-NEXT:    ret i64 [[TMP2]]
-;
-entry:
-  %0 = load i32, ptr %a, align 4
-  %conv = sext i32 %0 to i64
-  %arrayidx1 = getelementptr inbounds nuw i8, ptr %a, i32 4
-  %1 = load i32, ptr %arrayidx1, align 4
-  %conv2 = sext i32 %1 to i64
-  %add = add nsw i64 %conv2, %conv
-  %arrayidx3 = getelementptr inbounds nuw i8, ptr %a, i32 8
-  %2 = load i32, ptr %arrayidx3, align 4
-  %conv4 = sext i32 %2 to i64
-  %add5 = add nsw i64 %add, %conv4
-  %arrayidx6 = getelementptr inbounds nuw i8, ptr %a, i32 12
-  %3 = load i32, ptr %arrayidx6, align 4
-  %conv7 = sext i32 %3 to i64
-  %add8 = add nsw i64 %add5, %conv7
-  ret i64 %add8
-}

From 6a38c9aa491cd7a01cd5fae7326763723a1d7fb3 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube@arm.com>
Date: Fri, 14 Mar 2025 14:24:06 +0000
Subject: [PATCH 4/6] Responding to review comments

Change-Id: Ie8c40972ae07e568d767ace37b9dda0f77272569
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 48 ++++++++++-------
 .../CostModel/ARM/muls-in-smlal-patterns.ll   | 51 +++++++++++++++++--
 2 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 8ece8bbf4e4a9..e49286d6662c5 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1460,15 +1460,16 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
 
   // When targets have both DSP and MVE we find that the
   // the compiler will attempt to vectorize as well as using
-  // scalar SMLAL operations. This is in cases where we have
+  // scalar (S/U)MLAL operations. This is in cases where we have
   // the pattern ext(mul(ext(i16), ext(i16))) we find
-  // that generated codegen performs better when only using SMLAL scalar
-  // ops instead of trying to mix vector ops with SMLAL ops. We therefore
+  // that generated codegen performs better when only using (S/U)MLAL scalar
+  // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
   // check if a mul instruction is used in a SMLAL pattern.
-  auto MulInSMLALPattern = [&](const Instruction *I, unsigned Opcode,
-                               Type *Ty) -> bool {
-    if (!ST->hasDSP() || !ST->hasMVEIntegerOps())
+  auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
+                                 Type *Ty) -> bool {
+    if (!ST->hasDSP())
       return false;
+
     if (!I)
       return false;
 
@@ -1478,30 +1479,43 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
     if (Ty->isVectorTy())
       return false;
 
-    auto IsSExtInst = [](const Value *V) -> bool {
-      return (dyn_cast<SExtInst>(V)) ? true : false;
+    auto IsSExtInst = [](const Value *V) -> bool { return isa<SExtInst>(V); };
+    auto IsZExtInst = [](const Value *V) -> bool { return isa<ZExtInst>(V); };
+    auto IsExtInst = [&, IsSExtInst, IsZExtInst](const Value *V) -> bool {
+      return IsSExtInst(V) || IsZExtInst(V);
+    };
+    auto IsExtensionFromHalf = [&, IsSExtInst,
+                                IsZExtInst](const Value *V) -> bool {
+      if (IsSExtInst(V))
+        return dyn_cast<SExtInst>(V)->getOperand(0)->getType()->isIntegerTy(16);
+      if (IsZExtInst(V))
+        return dyn_cast<ZExtInst>(V)->getOperand(0)->getType()->isIntegerTy(16);
+      return false;
     };
 
-    // We check the arguments of the function to see if they're extends
+    // We check the arguments of the instruction to see if they're extends
     auto *BinOp = dyn_cast<BinaryOperator>(I);
     if (!BinOp)
       return false;
-    auto *Op0 = BinOp->getOperand(0);
-    auto *Op1 = BinOp->getOperand(1);
-    if (Op0 && Op1 && IsSExtInst(Op0) && IsSExtInst(Op1)) {
-      // In this case we're interested in an ext of an i16
-      if (!Op0->getType()->isIntegerTy(32) || !Op1->getType()->isIntegerTy(32))
+    Value *Op0 = BinOp->getOperand(0);
+    Value *Op1 = BinOp->getOperand(1);
+    if (IsExtInst(Op0) && IsExtInst(Op1)) {
+      // We're interested in an ext of an i16
+      if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
+          !IsExtensionFromHalf(Op1))
         return false;
       // We need to check if this result will be further extended to i64
+      // and that all these uses are SExt
       for (auto *U : I->users())
-        if (IsSExtInst(dyn_cast<Value>(U)))
-          return true;
+        if (!IsExtInst(dyn_cast<Value>(U)))
+          return false;
+      return true;
     }
 
     return false;
   };
 
-  if (MulInSMLALPattern(CxtI, Opcode, Ty))
+  if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
     return 0;
 
   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
index eafd378ab0aad..68a7ea9ef69af 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -1,7 +1,20 @@
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+mve,+dsp  < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
 define i64 @test(i16 %a, i16 %b) {
 ; CHECK-LABEL: 'test'
-; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+; CHECK-NO-DSP-LABEL: 'test'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
 ;
     %as = sext i16 %a to i32
     %bs = sext i16 %b to i32
@@ -12,7 +25,20 @@ define i64 @test(i16 %a, i16 %b) {
 
 define i64 @withadd(i16 %a, i16 %b, i64 %c) {
 ; CHECK-LABEL: 'withadd'
-; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withadd'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
 ;
     %as = sext i16 %a to i32
     %bs = sext i16 %b to i32
@@ -24,7 +50,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
 
 define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
 ; CHECK-LABEL: 'withloads'
-; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withloads'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
 ;
     %a = load i16, ptr %pa
     %b = load i16, ptr %pb

From 55f3c72e10719e62a79fcd2dfbcc12bcc14e090a Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube@arm.com>
Date: Tue, 18 Mar 2025 11:24:04 +0000
Subject: [PATCH 5/6] Further review comments

Change-Id: I0c20291a8a1d2cbc895890d02aea963323ee48cb
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 23 +++---
 .../CostModel/ARM/muls-in-smlal-patterns.ll   | 23 ++++++
 .../CostModel/ARM/muls-in-umull-patterns.ll   | 80 +++++++++++++++++++
 3 files changed, 114 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index e49286d6662c5..b7fcb6bd34241 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1479,17 +1479,16 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
     if (Ty->isVectorTy())
       return false;
 
-    auto IsSExtInst = [](const Value *V) -> bool { return isa<SExtInst>(V); };
-    auto IsZExtInst = [](const Value *V) -> bool { return isa<ZExtInst>(V); };
-    auto IsExtInst = [&, IsSExtInst, IsZExtInst](const Value *V) -> bool {
-      return IsSExtInst(V) || IsZExtInst(V);
+    auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
+      return cast<Instruction>(LHS)->getOpcode() ==
+             cast<Instruction>(RHS)->getOpcode();
     };
-    auto IsExtensionFromHalf = [&, IsSExtInst,
-                                IsZExtInst](const Value *V) -> bool {
-      if (IsSExtInst(V))
-        return dyn_cast<SExtInst>(V)->getOperand(0)->getType()->isIntegerTy(16);
-      if (IsZExtInst(V))
-        return dyn_cast<ZExtInst>(V)->getOperand(0)->getType()->isIntegerTy(16);
+    auto IsExtInst = [](const Value *V) -> bool {
+      return isa<ZExtInst>(V) || isa<SExtInst>(V);
+    };
+    auto IsExtensionFromHalf = [&, IsExtInst](const Value *V) -> bool {
+      if (IsExtInst(V))
+        return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
       return false;
     };
 
@@ -1499,7 +1498,7 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
       return false;
     Value *Op0 = BinOp->getOperand(0);
     Value *Op1 = BinOp->getOperand(1);
-    if (IsExtInst(Op0) && IsExtInst(Op1)) {
+    if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
       // We're interested in an ext of an i16
       if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
           !IsExtensionFromHalf(Op1))
@@ -1507,7 +1506,7 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
       // We need to check if this result will be further extended to i64
       // and that all these uses are SExt
       for (auto *U : I->users())
-        if (!IsExtInst(dyn_cast<Value>(U)))
+        if (!IsExtInst(U))
           return false;
       return true;
     }
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
index 68a7ea9ef69af..7de2799d5af9c 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+
 define i64 @test(i16 %a, i16 %b) {
 ; CHECK-LABEL: 'test'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
@@ -78,3 +79,25 @@ define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
     %r = add i64 %c, %ms
     ret i64 %r
 }
+
+define i64 @different_extend_ops(i16 %a, i16 %b) {
+; CHECK-LABEL: 'different_extend_ops'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+; CHECK-NO-DSP-LABEL: 'different_extend_ops'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+    %as = sext i16 %a to i32
+    %bs = zext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = sext i32 %m to i64
+    ret i64 %ms
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
new file mode 100644
index 0000000000000..521816d13000b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+define i64 @test(i16 %a, i16 %b) {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+; CHECK-NO-DSP-LABEL: 'test'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+    %as = zext i16 %a to i32
+    %bs = zext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = zext i32 %m to i64
+    ret i64 %ms
+}
+
+define i64 @withadd(i16 %a, i16 %b, i64 %c) {
+; CHECK-LABEL: 'withadd'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withadd'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+    %as = zext i16 %a to i32
+    %bs = zext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = zext i32 %m to i64
+    %r = add i64 %c, %ms
+    ret i64 %r
+}
+
+define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
+; CHECK-LABEL: 'withloads'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withloads'
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+    %a = load i16, ptr %pa
+    %b = load i16, ptr %pb
+    %as = zext i16 %a to i32
+    %bs = zext i16 %b to i32
+    %m = mul i32 %as, %bs
+    %ms = zext i32 %m to i64
+    %r = add i64 %c, %ms
+    ret i64 %r
+}

From df802a37e9fc9c4bd86a40979b93a30dfa8d0a6c Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube@arm.com>
Date: Tue, 18 Mar 2025 13:56:57 +0000
Subject: [PATCH 6/6] Minor comments

Change-Id: I9ce62da8405a08ece5e36c7e1ea0bb93e6f0b6c4
---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index b7fcb6bd34241..88ca37ea12cf5 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1462,9 +1462,9 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
   // the compiler will attempt to vectorize as well as using
   // scalar (S/U)MLAL operations. This is in cases where we have
   // the pattern ext(mul(ext(i16), ext(i16))) we find
-  // that generated codegen performs better when only using (S/U)MLAL scalar
+  // that codegen performs better when only using (S/U)MLAL scalar
   // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
-  // check if a mul instruction is used in a SMLAL pattern.
+  // check if a mul instruction is used in a (U/S)MLAL pattern.
   auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
                                  Type *Ty) -> bool {
     if (!ST->hasDSP())
@@ -1487,9 +1487,7 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
       return isa<ZExtInst>(V) || isa<SExtInst>(V);
     };
     auto IsExtensionFromHalf = [&, IsExtInst](const Value *V) -> bool {
-      if (IsExtInst(V))
-        return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
-      return false;
+      return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
     };
 
     // We check the arguments of the instruction to see if they're extends