-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86] SimplifyDemandedBitsForTargetNode - add handling for VPMADD52L/VPMADD52H #155494
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: XChy (XChy) Changes: Resolves #155387. Full diff: https://github.com/llvm/llvm-project/pull/155494.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19131fbd4102b..35f9256bb454d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44957,6 +44957,22 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Known.Zero.setLowBits(Known2.countMinTrailingZeros());
return false;
}
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: {
+ KnownBits KnownOp0, KnownOp1;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of operand 2).
+ APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
+ if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0, TLO,
+ Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -60068,6 +60084,18 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Simplify VPMADD52L/VPMADD52H operations.
+static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits), DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -60705,6 +60733,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/vpmadd.ll b/llvm/test/CodeGen/X86/vpmadd.ll
new file mode 100644
index 0000000000000..21027190ef318
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpmadd.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
+
+define <2 x i64> @test_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test_vpmadd52l:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpmadd52luq %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %shl1 = shl <2 x i64> %x1, <i64 52, i64 52>
+ %shl2 = shl <2 x i64> %x2, <i64 52, i64 52>
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %shl1, <2 x i64> %shl2)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_wrong_shift(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test_vpmadd52l_wrong_shift:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $51, %xmm1, %xmm1
+; CHECK-NEXT: vpsllq $51, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %shl1 = shl <2 x i64> %x1, <i64 51, i64 51>
+ %shl2 = shl <2 x i64> %x2, <i64 51, i64 51>
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %shl1, <2 x i64> %shl2)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_wrong_op(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test_vpmadd52l_wrong_op:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %shl0 = shl <2 x i64> %x0, <i64 52, i64 52>
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %shl0, <2 x i64> %x1, <2 x i64> %x2)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test_vpmadd52h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpmadd52huq %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %shl1 = shl <2 x i64> %x1, <i64 52, i64 52>
+ %shl2 = shl <2 x i64> %x2, <i64 52, i64 52>
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %shl1, <2 x i64> %shl2)
+ ret <2 x i64> %1
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64: {{.*}}
+; X86: {{.*}}
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few minors
8fbcb6b to
f57cb12
Compare
f57cb12 to
b258fd6
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
|
Thanks for your review. I would like to work on the constant fold of VPMADD52L/VPMADD52H. Can you assign it to me? Or I can post an issue myself. |
|
@houngkoungting is currently working on #155386 Building on this - please can you handle the vpmadd52(x,0,y) -> y fold in SimplifyDemandedBitsForTargetNode? If the lower 52 bits of either multiplicand are known zero. |
Sure, I will work on it after merging the knownbits patch. |
Resolves comment in #155494 (comment)
Resolves #155387.
The X86ISD::VPMADD52L/VPMADD52H nodes only demand the lower 52 bits of operands 0 / 1.