From 80e303c6e0976d8c2437a806679a54d5919c5917 Mon Sep 17 00:00:00 2001 From: william Date: Wed, 6 Aug 2025 16:17:48 +0800 Subject: [PATCH 01/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-1 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +++++++++++++++++++ llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 43 ++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/trunc-avg-fold.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d70e96938ed9a..9ff256f8090ba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16294,6 +16294,51 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // because targets may prefer a wider type during later combines and invert // this transform. switch (N0.getOpcode()) { + case ISD::AVGCEILU: + case ISD::AVGFLOORU: + if (!LegalOperations && N0.hasOneUse() && + TLI.isOperationLegal(N0.getOpcode(), VT)) { + SDValue X = N0.getOperand(0); + SDValue Y = N0.getOperand(1); + + KnownBits KnownX = DAG.computeKnownBits(X); + KnownBits KnownY = DAG.computeKnownBits(Y); + + unsigned SrcBits = X.getScalarValueSizeInBits(); + unsigned DstBits = VT.getScalarSizeInBits(); + unsigned NeededLeadingZeros = SrcBits - DstBits + 1; + + if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros && + KnownY.countMinLeadingZeros() >= NeededLeadingZeros) { + SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); + SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); + return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); + } + } + break; + + case ISD::AVGCEILS: + case ISD::AVGFLOORS: + if (!LegalOperations && N0.hasOneUse() && + TLI.isOperationLegal(N0.getOpcode(), VT)) { + SDValue X = N0.getOperand(0); + SDValue Y = N0.getOperand(1); + + unsigned SignBitsX = DAG.ComputeNumSignBits(X); + unsigned SignBitsY = DAG.ComputeNumSignBits(Y); + + unsigned SrcBits = X.getScalarValueSizeInBits(); + unsigned DstBits = VT.getScalarSizeInBits(); + unsigned NeededSignBits = SrcBits - DstBits + 1; + + if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) { + SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); + SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); + return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); + } + } + break; + case ISD::ADD: case ISD::SUB: case ISD::MUL: diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll new file mode 100644 index 0000000000000..175f54d6f9c05 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -0,0 +1,43 @@ +; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s + +; CHECK-LABEL: test_avgceil_u +; CHECK: uhadd v0.8b, v0.8b, v1.8b +define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { + %ta = trunc <8 x i16> %a to <8 x i8> + %tb = trunc <8 x i16> %b to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) + ret <8 x i8> %res +} + +; CHECK-LABEL: test_avgceil_s +; CHECK: shadd v0.8b, v0.8b, v1.8b +define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { + %ta = trunc <8 x i16> %a to <8 x i8> + %tb = trunc <8 x i16> %b to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) + ret <8 x i8> %res +} + +; CHECK-LABEL: test_avgfloor_u +; CHECK: urhadd v0.8b, v0.8b, v1.8b +define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { + %ta = trunc <8 x i16> %a to <8 x i8> + %tb = 
trunc <8 x i16> %b to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) + ret <8 x i8> %res +} + +; CHECK-LABEL: test_avgfloor_s +; CHECK: srhadd v0.8b, v0.8b, v1.8b +define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { + %ta = trunc <8 x i16> %a to <8 x i8> + %tb = trunc <8 x i16> %b to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) + ret <8 x i8> %res +} + +declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) + From 24287f7f08d3bd238761b6e798aee655af931c3d Mon Sep 17 00:00:00 2001 From: william Date: Fri, 8 Aug 2025 00:04:32 +0800 Subject: [PATCH 02/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits -2 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 +--- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 96 +++++++++++++++---- 2 files changed, 79 insertions(+), 33 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9ff256f8090ba..0cba06215d3fe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16300,37 +16300,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { TLI.isOperationLegal(N0.getOpcode(), VT)) { SDValue X = N0.getOperand(0); SDValue Y = N0.getOperand(1); - - KnownBits KnownX = DAG.computeKnownBits(X); - KnownBits KnownY = DAG.computeKnownBits(Y); - unsigned SrcBits = X.getScalarValueSizeInBits(); unsigned DstBits = VT.getScalarSizeInBits(); - unsigned NeededLeadingZeros = SrcBits - DstBits + 1; - - if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros && - KnownY.countMinLeadingZeros() >= NeededLeadingZeros) { + unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X); + unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y); + if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) { SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); } } break; - case ISD::AVGCEILS: case ISD::AVGFLOORS: if (!LegalOperations && N0.hasOneUse() && TLI.isOperationLegal(N0.getOpcode(), VT)) { SDValue X = N0.getOperand(0); SDValue Y = N0.getOperand(1); - unsigned SignBitsX = DAG.ComputeNumSignBits(X); unsigned SignBitsY = DAG.ComputeNumSignBits(Y); - unsigned SrcBits = X.getScalarValueSizeInBits(); unsigned DstBits = VT.getScalarSizeInBits(); unsigned NeededSignBits = SrcBits - DstBits + 1; - if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) { SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); @@ -16338,7 +16329,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } break; - case ISD::ADD: case ISD::SUB: case ISD::MUL: diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index 175f54d6f9c05..db40746776d43 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -1,38 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s -; CHECK-LABEL: test_avgceil_u -; CHECK: uhadd v0.8b, v0.8b, v1.8b + define <8 x i8> 
@test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { - %ta = trunc <8 x i16> %a to <8 x i8> - %tb = trunc <8 x i16> %b to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) +; CHECK-LABEL: test_avgceil_u: +; CHECK: // %bb.0: +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: xtn v1.8b, v1.8h +; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + + %mask = insertelement <8 x i16> undef, i16 255, i32 0 + %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer + %ta16 = and <8 x i16> %a, %mask.splat + %tb16 = and <8 x i16> %b, %mask.splat + %ta8 = trunc <8 x i16> %ta16 to <8 x i8> + %tb8 = trunc <8 x i16> %tb16 to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } -; CHECK-LABEL: test_avgceil_s -; CHECK: shadd v0.8b, v0.8b, v1.8b + define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { - %ta = trunc <8 x i16> %a to <8 x i8> - %tb = trunc <8 x i16> %b to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) +; CHECK-LABEL: test_avgceil_s: +; CHECK: // %bb.0: +; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: sqxtn v1.8b, v1.8h +; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + + %smin = insertelement <8 x i16> undef, i16 -128, i32 0 + %smax = insertelement <8 x i16> undef, i16 127, i32 0 + %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer + %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer + + %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max) + %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min) + %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max) + %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min) + + %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8> + %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } -; CHECK-LABEL: test_avgfloor_u -; CHECK: urhadd v0.8b, v0.8b, v1.8b + define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { - %ta = trunc <8 x i16> %a to <8 x i8> - %tb = trunc <8 x i16> %b to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) +; CHECK-LABEL: test_avgfloor_u: +; CHECK: // %bb.0: +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: xtn v1.8b, v1.8h +; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + + %mask = insertelement <8 x i16> undef, i16 255, i32 0 + %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer + %ta16 = and <8 x i16> %a, %mask.splat + %tb16 = and <8 x i16> %b, %mask.splat + %ta8 = trunc <8 x i16> %ta16 to <8 x i8> + %tb8 = trunc <8 x i16> %tb16 to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } -; CHECK-LABEL: test_avgfloor_s -; CHECK: srhadd v0.8b, v0.8b, v1.8b + define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { - %ta = trunc <8 x i16> %a to <8 x i8> - %tb = trunc <8 x i16> %b to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) +; CHECK-LABEL: test_avgfloor_s: +; CHECK: // %bb.0: +; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: sqxtn v1.8b, v1.8h +; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + + %smin = insertelement <8 x i16> undef, i16 -128, i32 0 + %smax = insertelement <8 x i16> undef, i16 127, i32 0 + 
%min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer + %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer + + %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max) + %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min) + %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max) + %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min) + + %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8> + %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8> + %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } @@ -41,3 +94,6 @@ declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) + From c8cc2a98030154d6a95154d8fe8d7461cfb0daf4 Mon Sep 17 00:00:00 2001 From: william Date: Fri, 8 Aug 2025 10:51:33 +0800 Subject: [PATCH 03/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits -3 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +++++--- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 20 +++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0cba06215d3fe..7aea288c03208 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16302,9 +16302,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue Y = N0.getOperand(1); unsigned SrcBits = X.getScalarValueSizeInBits(); unsigned DstBits = VT.getScalarSizeInBits(); - unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X); - unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y); - if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) { + KnownBits KnownX = DAG.computeKnownBits(X); + KnownBits KnownY = DAG.computeKnownBits(Y); + if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) && + KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) { SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); @@ -16322,6 +16323,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { unsigned SrcBits = X.getScalarValueSizeInBits(); unsigned DstBits = VT.getScalarSizeInBits(); unsigned NeededSignBits = SrcBits - DstBits + 1; + if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) { SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index db40746776d43..ede39e237a9c9 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -10,8 +10,8 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %mask = insertelement <8 x i16> undef, i16 255, i32 0 - %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer + %mask = insertelement <8 x i16> poison, i16 255, i32 0 + %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = and <8 x i16> %a, %mask.splat %tb16 = and <8 
x i16> %b, %mask.splat %ta8 = trunc <8 x i16> %ta16 to <8 x i8> @@ -29,10 +29,10 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %smin = insertelement <8 x i16> undef, i16 -128, i32 0 - %smax = insertelement <8 x i16> undef, i16 127, i32 0 - %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer - %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer + %smin = insertelement <8 x i16> poison, i16 -128, i32 0 + %smax = insertelement <8 x i16> poison, i16 127, i32 0 + %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer + %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max) %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min) @@ -73,10 +73,10 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %smin = insertelement <8 x i16> undef, i16 -128, i32 0 - %smax = insertelement <8 x i16> undef, i16 127, i32 0 - %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer - %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer + %smin = insertelement <8 x i16> poison, i16 -128, i32 0 + %smax = insertelement <8 x i16> poison, i16 127, i32 0 + %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer + %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max) %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min) From 11152562f1255a4fcd60404d1e08ca80bf422090 Mon Sep 17 00:00:00 2001 From: william Date: Fri, 8 Aug 2025 11:40:46 +0800 Subject: [PATCH 04/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-4 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 59 ++++++++------------- 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index ede39e237a9c9..4d4e828a751bd 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s - define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgceil_u: ; CHECK: // %bb.0: @@ -9,7 +8,6 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: xtn v1.8b, v1.8h ; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %mask = insertelement <8 x i16> poison, i16 255, i32 0 %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = and <8 x i16> %a, %mask.splat @@ -20,7 +18,6 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ret <8 x i8> %res } - define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgceil_s: ; CHECK: // %bb.0: @@ -28,72 +25,60 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: sqxtn v1.8b, v1.8h ; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - - %smin = insertelement <8 x i16> poison, i16 -128, i32 0 - %smax = insertelement <8 x i16> poison, i16 127, i32 0 - %min = 
shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer - %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer - - %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max) - %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min) - %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max) - %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min) - + %min = insertelement <8 x i16> poison, i16 -128, i32 0 + %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer + %max = insertelement <8 x i16> poison, i16 127, i32 0 + %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer + %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat) + %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat) + %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat) + %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat) %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8> %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8> %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } - define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgfloor_u: ; CHECK: // %bb.0: ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: xtn v1.8b, v1.8h -; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - - %mask = insertelement <8 x i16> undef, i16 255, i32 0 - %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer + %mask = insertelement <8 x i16> poison, i16 255, i32 0 + %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = and <8 x i16> %a, %mask.splat %tb16 = and <8 x i16> %b, %mask.splat %ta8 = trunc <8 x i16> %ta16 to <8 x i8> %tb8 = trunc <8 x i16> %tb16 to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) + %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } - define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgfloor_s: ; CHECK: // %bb.0: ; CHECK-NEXT: sqxtn v0.8b, v0.8h ; CHECK-NEXT: sqxtn v1.8b, v1.8h -; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - - %smin = insertelement <8 x i16> poison, i16 -128, i32 0 - %smax = insertelement <8 x i16> poison, i16 127, i32 0 - %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer - %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer - - %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max) - %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min) - %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max) - %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min) - + %min = insertelement <8 x i16> poison, i16 -128, i32 0 + %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer + %max = insertelement <8 x i16> poison, i16 127, i32 0 + %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer + %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat) + %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> 
%min.splat) + %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat) + %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat) %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8> %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) + %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) ret <8 x i8> %res } declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) - declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) From 08138a2fde9896a580d11a2b4249eea86d42fefe Mon Sep 17 00:00:00 2001 From: william Date: Fri, 8 Aug 2025 12:55:44 +0800 Subject: [PATCH 05/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-5 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 52 ++++++++++++--------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index 4d4e828a751bd..36fddedd78df6 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -4,26 +4,31 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgceil_u: ; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: bic v1.8h, #255, lsl #8 +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: xtn v1.8b, v1.8h -; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %mask = insertelement <8 x i16> poison, i16 255, i32 0 %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = and <8 x i16> %a, %mask.splat %tb16 = and <8 x i16> %b, %mask.splat - %ta8 = trunc <8 x i16> %ta16 to <8 x i8> - %tb8 = trunc <8 x i16> %tb16 to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) + %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16) + %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgceil_s: ; CHECK: // %bb.0: -; CHECK-NEXT: sqxtn v0.8b, v0.8h -; CHECK-NEXT: sqxtn v1.8b, v1.8h -; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: movi v2.8h, #127 +; CHECK-NEXT: mvni v3.8h, #127 +; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h +; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h +; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h +; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret %min = insertelement <8 x i16> poison, i16 -128, i32 0 %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer @@ -33,35 +38,39 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat) %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat) %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat) - %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8> - %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 
x i8> %ta8, <8 x i8> %tb8) + %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped) + %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgfloor_u: ; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: bic v1.8h, #255, lsl #8 +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: xtn v1.8b, v1.8h -; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %mask = insertelement <8 x i16> poison, i16 255, i32 0 %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer %ta16 = and <8 x i16> %a, %mask.splat %tb16 = and <8 x i16> %b, %mask.splat - %ta8 = trunc <8 x i16> %ta16 to <8 x i8> - %tb8 = trunc <8 x i16> %tb16 to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) + %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16) + %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_avgfloor_s: ; CHECK: // %bb.0: -; CHECK-NEXT: sqxtn v0.8b, v0.8h -; CHECK-NEXT: sqxtn v1.8b, v1.8h -; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: movi v2.8h, #127 +; CHECK-NEXT: mvni v3.8h, #127 +; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h +; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h +; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h +; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret %min = insertelement <8 x i16> poison, i16 -128, i32 0 %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer @@ -71,9 +80,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat) %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat) %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat) - %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8> - %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8> - %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8) + %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped) + %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } From 728b37db85a9821aec9931af00a8338ae9d7c95e Mon Sep 17 00:00:00 2001 From: william Date: Fri, 8 Aug 2025 13:05:41 +0800 Subject: [PATCH 06/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-6 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index 36fddedd78df6..24a1e6f60c078 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -85,8 +85,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { ret <8 x i8> %res } -declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x 
i16>) From 44609a3b749675b758f1030b9401497192491dd4 Mon Sep 17 00:00:00 2001 From: william Date: Sat, 9 Aug 2025 21:13:30 +0800 Subject: [PATCH 07/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-7 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 36 +++++++-------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index 24a1e6f60c078..ca8e713cafc13 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -9,10 +9,8 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret - %mask = insertelement <8 x i16> poison, i16 255, i32 0 - %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer - %ta16 = and <8 x i16> %a, %mask.splat - %tb16 = and <8 x i16> %b, %mask.splat + %ta16 = and <8 x i16> %a, splat (i16 255) + %tb16 = and <8 x i16> %b, splat (i16 255) %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16) %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res @@ -30,14 +28,10 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret - %min = insertelement <8 x i16> poison, i16 -128, i32 0 - %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer - %max = insertelement <8 x i16> poison, i16 127, i32 0 - %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer - %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat) - %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat) - %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat) - %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat) + %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127)) + %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128)) + %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127)) + %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128)) %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped) %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res @@ -51,10 +45,8 @@ define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret - %mask = insertelement <8 x i16> poison, i16 255, i32 0 - %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer - %ta16 = and <8 x i16> %a, %mask.splat - %tb16 = and <8 x i16> %b, %mask.splat + %ta16 = and <8 x i16> %a, splat (i16 255) + %tb16 = and <8 x i16> %b, splat (i16 255) %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16) %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res @@ -72,14 +64,10 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret - %min = insertelement <8 x i16> poison, i16 -128, i32 0 - %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer - %max = insertelement <8 
x i16> poison, i16 127, i32 0 - %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer - %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat) - %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat) - %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat) - %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat) + %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127)) + %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128)) + %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127)) + %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128)) %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped) %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res From 2d268fc6bd5de28d1dd6adbabc732e475a530014 Mon Sep 17 00:00:00 2001 From: william Date: Sun, 17 Aug 2025 00:09:15 +0800 Subject: [PATCH 08/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-8 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 81 +++++++-------------- 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index ca8e713cafc13..8d9ea6c9d9922 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -1,80 +1,53 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s -define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_avgceil_u: +define <8 x i8> @avgceil_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: avgceil_u_i8_to_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: bic v1.8h, #255, lsl #8 -; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %ta16 = and <8 x i16> %a, splat (i16 255) - %tb16 = and <8 x i16> %b, splat (i16 255) - %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16) - %res = trunc <8 x i16> %avg16 to <8 x i8> - ret <8 x i8> %res + %a16 = zext <8 x i8> %a to <8 x i16> + %b16 = zext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %r = trunc <8 x i16> %avg16 to <8 x i8> + ret <8 x i8> %r } -define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { + +define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_avgceil_s: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.8h, #127 -; CHECK-NEXT: mvni v3.8h, #127 -; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h -; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h -; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h -; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h -; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127)) - %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128)) - %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127)) - %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> 
splat (i16 -128)) - %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped) - %res = trunc <8 x i16> %avg16 to <8 x i8> + %a16 = sext <8 x i8> %a to <8 x i16> + %b16 = sext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } -define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_avgfloor_u: +define <8 x i8> @avgfloor_u_from_intrin(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: avgfloor_u_from_intrin: ; CHECK: // %bb.0: -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: bic v1.8h, #255, lsl #8 -; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %ta16 = and <8 x i16> %a, splat (i16 255) - %tb16 = and <8 x i16> %b, splat (i16 255) - %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16) + %a16 = zext <8 x i8> %a to <8 x i16> + %b16 = zext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } -define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { +define <8 x i8> @test_avgfloor_s(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_avgfloor_s: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.8h, #127 -; CHECK-NEXT: mvni v3.8h, #127 -; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h -; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h -; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h -; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h -; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127)) - %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128)) - %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127)) - %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128)) - %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped) - %res = trunc <8 x i16> %avg16 to <8 x i8> + %a16 = sext <8 x i8> %a to <8 x i16> + %b16 = sext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %res = trunc <8 x i16> %avg16 to <8 x i8> ret <8 x i8> %res } -declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) From 32041fbb0b9696b8ab59feab66354aad96e4b1f7 Mon Sep 17 00:00:00 2001 From: william Date: Sun, 17 Aug 2025 00:10:04 +0800 Subject: [PATCH 09/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-9 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index 8d9ea6c9d9922..030e9ea994264 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -26,7 +26,7 @@ define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) { ret <8 x i8> %res } -define <8 x i8> @avgfloor_u_from_intrin(<8 x i8> %a, <8 x i8> %b) { +define <8 x i8> 
@avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: avgfloor_u_from_intrin: ; CHECK: // %bb.0: ; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b From 4e1af14d3efaed8c47448a158f547bdcd47879b3 Mon Sep 17 00:00:00 2001 From: william Date: Sun, 17 Aug 2025 23:32:43 +0800 Subject: [PATCH 10/15] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-10 --- llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll index 030e9ea994264..54fcae4ba28b7 100644 --- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -27,7 +27,7 @@ define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) { } define <8 x i8> @avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: avgfloor_u_from_intrin: +; CHECK-LABEL: avgfloor_u_i8_to_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret From c4ea7bdf7df0749e30479967d7643b363df43bf7 Mon Sep 17 00:00:00 2001 From: william Date: Mon, 18 Aug 2025 21:05:31 +0800 Subject: [PATCH 11/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-11 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7aea288c03208..738aa96b729ec 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16302,10 +16302,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue Y = N0.getOperand(1); unsigned SrcBits = X.getScalarValueSizeInBits(); unsigned DstBits = VT.getScalarSizeInBits(); - KnownBits KnownX = DAG.computeKnownBits(X); - KnownBits KnownY = DAG.computeKnownBits(Y); - if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) && - KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) { + APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits); + if (DAG.MaskedValueIsZero(X, UpperBits) && + DAG.MaskedValueIsZero(Y, UpperBits)) { SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); @@ -16318,13 +16317,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { TLI.isOperationLegal(N0.getOpcode(), VT)) { SDValue X = N0.getOperand(0); SDValue Y = N0.getOperand(1); - unsigned SignBitsX = DAG.ComputeNumSignBits(X); - unsigned SignBitsY = DAG.ComputeNumSignBits(Y); unsigned SrcBits = X.getScalarValueSizeInBits(); unsigned DstBits = VT.getScalarSizeInBits(); unsigned NeededSignBits = SrcBits - DstBits + 1; - - if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) { + if (DAG.ComputeNumSignBits(X) >= NeededSignBits && + DAG.ComputeNumSignBits(Y) >= NeededSignBits) { SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); From fac54fffd2fc76a4523bb26008e2e2b5a37c0a16 Mon Sep 17 00:00:00 2001 From: william Date: Wed, 17 Sep 2025 10:23:43 +0800 Subject: [PATCH 12/15] [X86] X86TargetLowering::computeKnownBitsForTargetNode - add X86ISD::VPMADD52L\H handling-1 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 20 +++ llvm/test/CodeGen/X86/knownbits-vpmadd52.ll | 138 ++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 
llvm/test/CodeGen/X86/knownbits-vpmadd52.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f81efdc6414aa..b345a57d46863 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38999,6 +38999,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: { + assert(Op.getValueType().isVector() && + Op.getValueType().getScalarType() == MVT::i64 && + "Unexpected VPMADD52 type"); + KnownBits K0 = + DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + KnownBits K1 = + DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + KnownBits KAcc = + DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1); + K0 = K0.trunc(52); + K1 = K1.trunc(52); + KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L) + ? KnownBits::mul(K0, K1) + : KnownBits::mulhu(K0, K1); + KnownMul = KnownMul.zext(64); + Known = KnownBits::add(KAcc, KnownMul); + return; + } } // Handle target shuffles. diff --git a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll new file mode 100644 index 0000000000000..0b5be5fc9900b --- /dev/null +++ b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=AVX512VL + + + +; H path: take the high 52 bits of the product and add them to the accumulator +; 25-bit = (1<<25)-1 = 33554431 +; 26-bit = (1<<26)-1 = 67108863 + +declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) + +define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) { +; AVX512VL-LABEL: kb52h_128_mask25_and1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] +; AVX512VL-NEXT: # xmm0 = mem[0,0] +; AVX512VL-NEXT: retq + %mx = and <2 x i64> %x, + %my = and <2 x i64> %y, + %r = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128( + <2 x i64> , ; acc + <2 x i64> %mx, ; x (masked to 25-bit) + <2 x i64> %my) ; y (masked to 25-bit) + %ret = and <2 x i64> %r, + ret <2 x i64> %ret +} + +define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) { +; AVX512VL-LABEL: kb52h_256_mask25x26_acc1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,1,1,1] +; AVX512VL-NEXT: retq + %mx = and <4 x i64> %x, + %my = and <4 x i64> %y, + %r = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256( + <4 x i64> , + <4 x i64> %mx, + <4 x i64> %my) + ret <4 x i64> %r +} + +define <8 x i64> @kb52h_512_mask25_and1(<8 x i64> %x, <8 x i64> %y) { +; AVX512VL-LABEL: kb52h_512_mask25_and1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastsd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: retq + %mx = and <8 x i64> %x, + %my = and <8 x i64> %y, + %r = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512( + <8 x i64> , + <8 x i64> %mx, + <8 x i64> %my) + %ret = and <8 x i64> %r, + ret <8 x i64> %ret +} + + +; 26-bit = 67108863 = (1<<26)-1 +; 50-bit = 1125899906842623 = (1<<50)-1 + +declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) +declare <8 x 
i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) + + + +define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) { +; AVX512VL-LABEL: kb52l_128_mask26x26_add_intrin: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863] +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512VL-NEXT: retq + %xm = and <2 x i64> %x, + %ym = and <2 x i64> %y, + %r = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %acc, <2 x i64> %xm, <2 x i64> %ym) + ret <2 x i64> %r +} + + + +define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x i64> %acc) { +; AVX512VL-LABEL: kb52l_256_mask50x3_add_intrin: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-NEXT: retq + %xm = and <4 x i64> %x, + %ym = and <4 x i64> %y, + %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym) + ret <4 x i64> %r +} + + + +define <8 x i64> @kb52l_512_mask26x26_add_intrin(<8 x i64> %x, <8 x i64> %y, <8 x i64> %acc) { +; AVX512-NOVL-LABEL: kb52l_512_mask26x26_add_intrin: +; AVX512-NOVL: vpmadd52luq +; AVX512-NOVL: retq +; AVX512VL-LABEL: kb52l_512_mask26x26_add_intrin: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512VL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 +; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512VL-NEXT: retq + %xm = and <8 x i64> %x, + %ym = and <8 x i64> %y, + %r = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %xm, <8 x i64> %ym) + ret <8 x i64> %r +} + + + + +define <2 x i64> @kb52l_128_neg_27x27_plain(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) { +; AVX512VL-LABEL: kb52l_128_neg_27x27_plain: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108864,67108864] +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: retq + %xm = and <2 x i64> %x, ; 1<<26 + %ym = and <2 x i64> %y, + %mul = mul <2 x i64> %xm, %ym + %res = add <2 x i64> %mul, %acc + ret <2 x i64> %res +} + From c5100dcee32919cd250088ece985123e6bf231ab Mon Sep 17 00:00:00 2001 From: william Date: Wed, 17 Sep 2025 10:39:09 +0800 Subject: [PATCH 13/15] Remove unintended changes to DAGCombiner.cpp --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8f4e84a34a8bd..4b20b756f8a15 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16354,40 +16354,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // because targets may prefer a wider type during later combines and invert // this transform. 
switch (N0.getOpcode()) { - case ISD::AVGCEILU: - case ISD::AVGFLOORU: - if (!LegalOperations && N0.hasOneUse() && - TLI.isOperationLegal(N0.getOpcode(), VT)) { - SDValue X = N0.getOperand(0); - SDValue Y = N0.getOperand(1); - unsigned SrcBits = X.getScalarValueSizeInBits(); - unsigned DstBits = VT.getScalarSizeInBits(); - APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits); - if (DAG.MaskedValueIsZero(X, UpperBits) && - DAG.MaskedValueIsZero(Y, UpperBits)) { - SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); - SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); - return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); - } - } - break; - case ISD::AVGCEILS: - case ISD::AVGFLOORS: - if (!LegalOperations && N0.hasOneUse() && - TLI.isOperationLegal(N0.getOpcode(), VT)) { - SDValue X = N0.getOperand(0); - SDValue Y = N0.getOperand(1); - unsigned SrcBits = X.getScalarValueSizeInBits(); - unsigned DstBits = VT.getScalarSizeInBits(); - unsigned NeededSignBits = SrcBits - DstBits + 1; - if (DAG.ComputeNumSignBits(X) >= NeededSignBits && - DAG.ComputeNumSignBits(Y) >= NeededSignBits) { - SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); - SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); - return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); - } - } - break; case ISD::ADD: case ISD::SUB: case ISD::MUL: From 27f0f4295c972e3b5611f13352c79d24c04a8bcf Mon Sep 17 00:00:00 2001 From: william Date: Fri, 19 Sep 2025 00:19:05 +0800 Subject: [PATCH 14/15] update test case --- llvm/test/CodeGen/X86/knownbits-vpmadd52.ll | 137 ++++++++------------ 1 file changed, 52 insertions(+), 85 deletions(-) diff --git a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll index 0b5be5fc9900b..b3f7fe205a958 100644 --- a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll +++ b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll @@ -1,15 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma | FileCheck %s --check-prefixes=AVXIFMA - -; H path: take the high 52 bits of the product and add them to the accumulator -; 25-bit = (1<<25)-1 = 33554431 -; 26-bit = (1<<26)-1 = 67108863 +; High-52 path declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) -declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) { ; AVX512VL-LABEL: kb52h_128_mask25_and1: @@ -17,13 +14,19 @@ define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) { ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] ; AVX512VL-NEXT: # xmm0 = mem[0,0] ; AVX512VL-NEXT: retq - %mx = and <2 x i64> %x, - %my = and <2 x i64> %y, +; +; AVXIFMA-LABEL: kb52h_128_mask25_and1: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] +; AVXIFMA-NEXT: # xmm0 = mem[0,0] +; AVXIFMA-NEXT: retq + %mx = and <2 x i64> %x, splat (i64 33554431) ; (1<<25)-1 + %my = and <2 x i64> %y, splat (i64 33554431) ; (1<<25)-1 %r = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128( - <2 x i64> , ; acc - <2 x i64> %mx, ; x (masked to 25-bit) - <2 x i64> %my) ; y (masked to 25-bit) - %ret = and <2 x i64> %r, + 
<2 x i64> splat (i64 1), + <2 x i64> %mx, + <2 x i64> %my) + %ret = and <2 x i64> %r, splat (i64 1) ret <2 x i64> %ret } @@ -32,39 +35,23 @@ define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,1,1,1] ; AVX512VL-NEXT: retq - %mx = and <4 x i64> %x, - %my = and <4 x i64> %y, +; +; AVXIFMA-LABEL: kb52h_256_mask25x26_acc1: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,1,1,1] +; AVXIFMA-NEXT: retq + %mx = and <4 x i64> %x, splat (i64 33554431) ; (1<<25)-1 + %my = and <4 x i64> %y, splat (i64 67108863) ; (1<<26)-1 %r = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256( - <4 x i64> , - <4 x i64> %mx, - <4 x i64> %my) + <4 x i64> splat (i64 1), + <4 x i64> %mx, <4 x i64> %my) ret <4 x i64> %r } -define <8 x i64> @kb52h_512_mask25_and1(<8 x i64> %x, <8 x i64> %y) { -; AVX512VL-LABEL: kb52h_512_mask25_and1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastsd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: retq - %mx = and <8 x i64> %x, - %my = and <8 x i64> %y, - %r = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512( - <8 x i64> , - <8 x i64> %mx, - <8 x i64> %my) - %ret = and <8 x i64> %r, - ret <8 x i64> %ret -} - - -; 26-bit = 67108863 = (1<<26)-1 -; 50-bit = 1125899906842623 = (1<<50)-1 +; Low-52 path declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) -declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) - - define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) { ; AVX512VL-LABEL: kb52l_128_mask26x26_add_intrin: @@ -75,14 +62,22 @@ define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 ; AVX512VL-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512VL-NEXT: retq - %xm = and <2 x i64> %x, - %ym = and <2 x i64> %y, - %r = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %acc, <2 x i64> %xm, <2 x i64> %ym) +; +; AVXIFMA-LABEL: kb52l_128_mask26x26_add_intrin: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863] +; AVXIFMA-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %xmm1, %xmm0, %xmm2 +; AVXIFMA-NEXT: vmovdqa %xmm2, %xmm0 +; AVXIFMA-NEXT: retq + %xm = and <2 x i64> %x, splat (i64 67108863) ; (1<<26)-1 + %ym = and <2 x i64> %y, splat (i64 67108863) ; (1<<26)-1 + %r = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128( + <2 x i64> %acc, <2 x i64> %xm, <2 x i64> %ym) ret <2 x i64> %r } - - define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x i64> %acc) { ; AVX512VL-LABEL: kb52l_256_mask50x3_add_intrin: ; AVX512VL: # %bb.0: @@ -91,48 +86,20 @@ define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x ; AVX512VL-NEXT: vpmadd52luq %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq - %xm = and <4 x i64> %x, - %ym = and <4 x i64> %y, - %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym) +; +; AVXIFMA-LABEL: kb52l_256_mask50x3_add_intrin: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] +; AVXIFMA-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [3,3,3,3] +; AVXIFMA-NEXT: vpand %ymm3, %ymm1, %ymm1 +; 
AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm2, %ymm0 +; AVXIFMA-NEXT: retq + %xm = and <4 x i64> %x, splat (i64 1125899906842623) ; (1<<50)-1 + %ym = and <4 x i64> %y, splat (i64 3) + %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256( + <4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym) ret <4 x i64> %r } - - -define <8 x i64> @kb52l_512_mask26x26_add_intrin(<8 x i64> %x, <8 x i64> %y, <8 x i64> %acc) { -; AVX512-NOVL-LABEL: kb52l_512_mask26x26_add_intrin: -; AVX512-NOVL: vpmadd52luq -; AVX512-NOVL: retq -; AVX512VL-LABEL: kb52l_512_mask26x26_add_intrin: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] -; AVX512VL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512VL-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512VL-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 -; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512VL-NEXT: retq - %xm = and <8 x i64> %x, - %ym = and <8 x i64> %y, - %r = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %xm, <8 x i64> %ym) - ret <8 x i64> %r -} - - - - -define <2 x i64> @kb52l_128_neg_27x27_plain(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) { -; AVX512VL-LABEL: kb52l_128_neg_27x27_plain: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108864,67108864] -; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: retq - %xm = and <2 x i64> %x, ; 1<<26 - %ym = and <2 x i64> %y, - %mul = mul <2 x i64> %xm, %ym - %res = add <2 x i64> %mul, %acc - ret <2 x i64> %res -} - From efeb7402d3a899e2a420cdf8057408e331080834 Mon Sep 17 00:00:00 2001 From: william Date: Mon, 22 Sep 2025 11:23:18 +0800 Subject: [PATCH 15/15] update test case: knownbits-vpmadd52.ll --- llvm/test/CodeGen/X86/knownbits-vpmadd52.ll | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll index b3f7fe205a958..0e322fec2c7d9 100644 --- a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll +++ b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll @@ -8,6 +8,7 @@ declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) +; High-52, 25x25 masked inputs, accumulator = 1, expected constant fold. define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) { ; AVX512VL-LABEL: kb52h_128_mask25_and1: ; AVX512VL: # %bb.0: @@ -30,6 +31,7 @@ define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) { ret <2 x i64> %ret } +; High-52, 25x26 masked inputs, accumulator = 1, expected constant fold. define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) { ; AVX512VL-LABEL: kb52h_256_mask25x26_acc1: ; AVX512VL: # %bb.0: @@ -53,6 +55,7 @@ define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) { declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) +; Low-52, 26x26 masked inputs, add with accumulator. 
define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) { ; AVX512VL-LABEL: kb52l_128_mask26x26_add_intrin: ; AVX512VL: # %bb.0: @@ -78,6 +81,7 @@ define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 ret <2 x i64> %r } +; Low-52, 50-bit × 2-bit masked inputs, add with accumulator. define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x i64> %acc) { ; AVX512VL-LABEL: kb52l_256_mask50x3_add_intrin: ; AVX512VL: # %bb.0: @@ -97,7 +101,7 @@ define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x ; AVXIFMA-NEXT: vmovdqa %ymm2, %ymm0 ; AVXIFMA-NEXT: retq %xm = and <4 x i64> %x, splat (i64 1125899906842623) ; (1<<50)-1 - %ym = and <4 x i64> %y, splat (i64 3) + %ym = and <4 x i64> %y, splat (i64 3) ; (1<<2)-1 %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256( <4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym) ret <4 x i64> %r
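
A quick numeric cross-check of the known-bits bound that the kb52h_* expectations above rely on: VPMADD52H adds bits [103:52] of the 104-bit product to the accumulator, so once both multiplicands are masked to at most 25 bits the product has at most 50 significant bits, that high-52 slice is provably zero, and the intrinsic folds to the accumulator alone. What follows is only an illustrative standalone C++ sketch of that arithmetic, not part of the patch series; the sample operand values are arbitrary and not taken from the patches.

// Sanity check for the bound behind kb52h_128_mask25_and1: with both
// multiplicands masked to 25 bits, the full product fits in 50 bits, so
// bits [103:52] of the 104-bit product (the VPMADD52H addend) are zero.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Mask25 = (1ULL << 25) - 1;          // 33554431, as in the tests
  uint64_t X = 0x0123456789abcdefULL & Mask25;       // arbitrary sample operand
  uint64_t Y = 0x0fedcba987654321ULL & Mask25;       // arbitrary sample operand
  unsigned __int128 Prod = (unsigned __int128)X * Y; // at most 50 bits wide
  uint64_t High52 = (uint64_t)(Prod >> 52);          // the VPMADD52H contribution
  assert(High52 == 0);                               // acc + 0 == acc, i.e. the fold
  return 0;
}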