diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index deb87365ae8d7..59496ebb93cd7 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20295,6 +20295,9 @@ More update operation types may be added in the future. declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask) declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask) + declare void @llvm.experimental.vector.histogram.uadd.sat.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask) + declare void @llvm.experimental.vector.histogram.umax.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask) + declare void @llvm.experimental.vector.histogram.umin.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask) Arguments: """""""""" diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 14ecae41ff08f..31a0ba2e6500d 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1947,6 +1947,24 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[], LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask [ IntrArgMemOnly ]>; +def int_experimental_vector_histogram_uadd_sat : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, // Vector of pointers + llvm_anyint_ty, // Increment + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask + [ IntrArgMemOnly ]>; + +def int_experimental_vector_histogram_umin : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, // Vector of pointers + llvm_anyint_ty, // Update value + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask + [ IntrArgMemOnly ]>; + +def int_experimental_vector_histogram_umax : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, // Vector of pointers + llvm_anyint_ty, // Update value + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask + [ IntrArgMemOnly ]>; + // Experimental match def int_experimental_vector_match : DefaultAttrsIntrinsic< [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], diff --git 
a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 63fcc1760ccaf..9506747bf464d 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -968,6 +968,29 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, // FIXME: Do we need to add an alignment parameter to the intrinsic? unsigned VectorWidth = AddrType->getNumElements(); + auto CreateHistogramUpdateValue = [&](IntrinsicInst *CI, Value *Load, + Value *Inc) -> Value * { + Value *UpdateOp; + switch (CI->getIntrinsicID()) { + case Intrinsic::experimental_vector_histogram_add: + UpdateOp = Builder.CreateAdd(Load, Inc); + break; + case Intrinsic::experimental_vector_histogram_uadd_sat: + UpdateOp = + Builder.CreateIntrinsic(Intrinsic::uadd_sat, {EltTy}, {Load, Inc}); + break; + case Intrinsic::experimental_vector_histogram_umin: + UpdateOp = Builder.CreateIntrinsic(Intrinsic::umin, {EltTy}, {Load, Inc}); + break; + case Intrinsic::experimental_vector_histogram_umax: + UpdateOp = Builder.CreateIntrinsic(Intrinsic::umax, {EltTy}, {Load, Inc}); + break; + + default: + llvm_unreachable("Unexpected histogram intrinsic"); + } + return UpdateOp; + }; // Shorten the way if the mask is a vector of constants. 
if (isConstantIntVector(Mask)) { @@ -976,8 +999,9 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, continue; Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx)); LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx)); - Value *Add = Builder.CreateAdd(Load, Inc); - Builder.CreateStore(Add, Ptr); + Value *Update = + CreateHistogramUpdateValue(cast<IntrinsicInst>(CI), Load, Inc); + Builder.CreateStore(Update, Ptr); } CI->eraseFromParent(); return; @@ -997,8 +1021,9 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, Builder.SetInsertPoint(CondBlock->getTerminator()); Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx)); LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx)); - Value *Add = Builder.CreateAdd(Load, Inc); - Builder.CreateStore(Add, Ptr); + Value *UpdateOp = + CreateHistogramUpdateValue(cast<IntrinsicInst>(CI), Load, Inc); + Builder.CreateStore(UpdateOp, Ptr); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0); @@ -1089,6 +1114,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, default: break; case Intrinsic::experimental_vector_histogram_add: + case Intrinsic::experimental_vector_histogram_uadd_sat: + case Intrinsic::experimental_vector_histogram_umin: + case Intrinsic::experimental_vector_histogram_umax: if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(), CI->getArgOperand(1)->getType())) return false; diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll index e59d9098a30d6..ca74b4e95b0ae 100644 --- a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll +++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll @@ -112,3 +112,357 @@ define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, 
<4 x i1> <i1 true, i1 true, i1 true, i1 true>) ret void } + +define void @histogram_uadd_sat_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_uadd_sat_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB3_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB3_4 +; CHECK-NEXT: .LBB3_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: adds x9, x9, x0 +; CHECK-NEXT: csinv x9, x9, xzr, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB3_2 +; CHECK-NEXT: .LBB3_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: adds x9, x9, x0 +; CHECK-NEXT: csinv x9, x9, xzr, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.uadd.sat.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_uadd_sat_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_uadd_sat_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB4_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; 
CHECK-NEXT: .LBB4_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB4_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB4_8 +; CHECK-NEXT: .LBB4_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB4_6 +; CHECK-NEXT: .LBB4_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_uadd_sat_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_uadd_sat_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x10, v2.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: adds w8, w8, #1 +; CHECK-NEXT: csinv w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: adds w8, w8, #1 +; CHECK-NEXT: csinv w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: ret + %buckets = getelementptr i32, 
ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + ret void +} + +define void @histogram_umax_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_umax_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB6_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB6_4 +; CHECK-NEXT: .LBB6_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, hi +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB6_2 +; CHECK-NEXT: .LBB6_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, hi +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.umax.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_umax_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_umax_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB7_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB7_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB7_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; 
CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB7_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB7_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB7_8 +; CHECK-NEXT: .LBB7_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB7_6 +; CHECK-NEXT: .LBB7_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_umax_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_umax_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x10, v2.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc w8, w8, wzr, hi +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc w8, w8, wzr, hi +; CHECK-NEXT: str w8, [x10] 
+; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + ret void +} + +define void @histogram_umin_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_umin_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB9_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB9_4 +; CHECK-NEXT: .LBB9_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB9_2 +; CHECK-NEXT: .LBB9_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.umin.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_umin_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_umin_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB10_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB10_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB10_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, 
v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB10_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB10_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB10_8 +; CHECK-NEXT: .LBB10_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB10_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB10_6 +; CHECK-NEXT: .LBB10_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_umin_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_umin_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x10, v2.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc 
w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + ret void +}