diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7344387ffe552..d78f7b6052206 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1852,7 +1852,12 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, } bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { - return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1; + if (!Subtarget->hasSVEorSME()) + return true; + + // We can only use the BRKB + CNTP sequence with legal predicate types. + return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 && + VT != MVT::nxv2i1; } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 62e68de1359f7..9fe88b0d7a92d 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2075,6 +2075,18 @@ let Predicates = [HasSVEorSME] in { def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)), (CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1), (BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>; + + def : Pat<(i64 (AArch64CttzElts nxv8i1:$Op1)), + (CNTP_XPP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op1), + (BRKB_PPzP (PTRUE_H 31), PPR:$Op1))>; + + def : Pat<(i64 (AArch64CttzElts nxv4i1:$Op1)), + (CNTP_XPP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op1), + (BRKB_PPzP (PTRUE_S 31), PPR:$Op1))>; + + def : Pat<(i64 (AArch64CttzElts nxv2i1:$Op1)), + (CNTP_XPP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op1), + (BRKB_PPzP (PTRUE_D 31), PPR:$Op1))>; } defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>; @@ -2168,6 +2180,30 @@ let Predicates = [HasSVEorSME] in { (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)), sub_32)>; + def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv8i1:$Op2)))), + (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2), GPR64:$Op1)>; + + def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv8i1:$Op2))))), + (EXTRACT_SUBREG (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2), + (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)), + sub_32)>; + + def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv4i1:$Op2)))), + (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2), GPR64:$Op1)>; + + def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv4i1:$Op2))))), + (EXTRACT_SUBREG (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2), + (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)), + sub_32)>; + + def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv2i1:$Op2)))), + (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2), GPR64:$Op1)>; + + def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv2i1:$Op2))))), + (EXTRACT_SUBREG (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2), + (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)), + sub_32)>; + defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>; defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>; defm INDEX_RI : sve_int_index_ri<"index">; diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll index 01dc086d93853..cc1532ee33dcf 100644 --- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll +++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll @@ -3,14 +3,14 @@ define void @foo_no_vscale_range() { ; CHECK-LABEL: 'foo_no_vscale_range' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true) @@ -23,14 +23,14 @@ define void @foo_no_vscale_range() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false) @@ -95,24 +95,24 @@ define void @foo_no_vscale_range() { define void @foo_vscale_range_1_16() vscale_range(1,16) { ; CHECK-LABEL: 'foo_vscale_range_1_16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -144,24 +144,24 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) { define void @foo_vscale_range_1_16384() vscale_range(1,16384) { ; CHECK-LABEL: 'foo_vscale_range_1_16384' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index 9bd2ed240810d..211237542a15b 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -4,25 +4,6 @@ ; WITH VSCALE RANGE -define i64 @ctz_nxv8i1( %a) #0 { -; CHECK-LABEL: ctz_nxv8i1: -; CHECK: // %bb.0: -; CHECK-NEXT: index z0.h, #0, #-1 -; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: cnth x9 -; CHECK-NEXT: inch z0.h -; CHECK-NEXT: and z0.d, z0.d, z1.d -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: umaxv h0, p0, z0.h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w8, w9, w8 -; CHECK-NEXT: and x0, x8, #0xff -; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( %a, i1 0) - ret i64 %res -} - define i32 @ctz_nxv32i1( %a) #0 { ; CHECK-LABEL: ctz_nxv32i1: ; CHECK: // %bb.0: @@ -156,41 +137,166 @@ define i64 @vscale_4096_poison( %a) #1 { ret i64 %res } -; NO VSCALE RANGE +; EFFICIENT LOWERING USING BRKB -define i32 @ctz_nxv8i1_no_range( %a) { -; CHECK-LABEL: ctz_nxv8i1_no_range: +define i32 @ctz_nxv2i1( %a) { +; CHECK-LABEL: ctz_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: index z0.s, #0, #-1 -; CHECK-NEXT: cntw x8 -; CHECK-NEXT: punpklo p1.h, p0.b -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: cnth x9 -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: incw z0.s, all, mul #2 -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: umaxv s0, p0, z0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.d +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( %a, i1 0) + ret i32 %res +} + +define i32 @ctz_nxv2i1_poison( %a) { +; CHECK-LABEL: ctz_nxv2i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.d +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( %a, i1 1) + ret i32 %res +} + +define i64 @add_i64_ctz_nxv2i1_poison( %a, i64 %b) { +; CHECK-LABEL: add_i64_ctz_nxv2i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.d +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( %a, i1 1) + %add = add i64 %res, %b + ret i64 %add +} + +define i32 @add_i32_ctz_nxv2i1_poison( %a, i32 %b) { +; CHECK-LABEL: add_i32_ctz_nxv2i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.d +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( %a, i1 1) + %trunc = trunc i64 %res to i32 + %add = add i32 %trunc, %b + ret i32 %add +} + +define i32 @ctz_nxv4i1( %a) { +; CHECK-LABEL: ctz_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.s +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( %a, i1 0) + ret i32 %res +} + +define i32 @ctz_nxv4i1_poison( %a) { +; CHECK-LABEL: ctz_nxv4i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.s +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( %a, i1 1) + ret i32 %res +} + +define i64 @add_i64_ctz_nxv4i1_poison( %a, i64 %b) { +; CHECK-LABEL: add_i64_ctz_nxv4i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.s +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( %a, i1 1) + %add = add i64 %res, %b + ret i64 %add +} + +define i32 @add_i32_ctz_nxv4i1_poison( %a, i32 %b) { +; CHECK-LABEL: add_i32_ctz_nxv4i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.s +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( %a, i1 1) + %trunc = trunc i64 %res to i32 + %add = add i32 %trunc, %b + ret i32 %add +} + +define i32 @ctz_nxv8i1( %a) { +; CHECK-LABEL: ctz_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.h +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( %a, i1 0) ret i32 %res } -; MATCH WITH BRKB + CNTP +define i32 @ctz_nxv8i1_poison( %a) { +; CHECK-LABEL: ctz_nxv8i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.h +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( %a, i1 1) + ret i32 %res +} + +define i64 @add_i64_ctz_nxv8i1_poison( %a, i64 %b) { +; CHECK-LABEL: add_i64_ctz_nxv8i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.h +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( %a, i1 1) + %add = add i64 %res, %b + ret i64 %add +} -define i32 @ctz_nxv16i1( %pg, %a) { +define i32 @add_i32_ctz_nxv8i1_poison( %a, i32 %b) { +; CHECK-LABEL: add_i32_ctz_nxv8i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.h +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( %a, i1 1) + %trunc = trunc i64 %res to i32 + %add = add i32 %trunc, %b + ret i32 %add +} + +define i32 @ctz_nxv16i1( %a) { ; CHECK-LABEL: ctz_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret @@ -198,11 +304,11 @@ define i32 @ctz_nxv16i1( %pg, %a) { ret i32 %res } -define i32 @ctz_nxv16i1_poison( %pg, %a) { +define i32 @ctz_nxv16i1_poison( %a) { ; CHECK-LABEL: ctz_nxv16i1_poison: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret @@ -226,11 +332,11 @@ define i32 @ctz_and_nxv16i1( %pg, %a, %pg, %a, i64 %b) { +define i64 @add_i64_ctz_nxv16i1_poison( %a, i64 %b) { ; CHECK-LABEL: add_i64_ctz_nxv16i1_poison: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: incp x0, p0.b ; CHECK-NEXT: ret %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( %a, i1 1) @@ -238,12 +344,12 @@ define i64 @add_i64_ctz_nxv16i1_poison( %pg, %pg, %a, i32 %b) { +define i32 @add_i32_ctz_nxv16i1_poison( %a, i32 %b) { ; CHECK-LABEL: add_i32_ctz_nxv16i1_poison: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: incp x0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret