diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0f17312b03827..8bc3194161bbd 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -222,7 +222,12 @@ class TargetTransformInfoImplBase; /// for IR-level transformations. class TargetTransformInfo { public: - enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; + enum PartialReductionExtendKind { + PR_None, + PR_SignExtend, + PR_ZeroExtend, + PR_FPExtend + }; /// Get the kind of extension that an instruction represents. LLVM_ABI static PartialReductionExtendKind diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0426ac7e62fab..32d4ac4bbe270 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1014,6 +1014,8 @@ TargetTransformInfo::getPartialReductionExtendKind( return PR_ZeroExtend; case Instruction::CastOps::SExt: return PR_SignExtend; + case Instruction::CastOps::FPExt: + return PR_FPExtend; default: return PR_None; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 197aae6e03cb1..5eaa604387659 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5761,7 +5761,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( (!ST->isNeonAvailable() || !ST->hasDotProd())) return Invalid; - if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || + if ((Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::FAdd) || OpAExtend == TTI::PR_None) return Invalid; @@ -5771,7 +5772,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( // We only support multiply binary operations for now, and for muls we // require the types being extended to be the same. 
- if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB)) + if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) || + InputTypeA != InputTypeB)) return Invalid; bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend; @@ -5842,6 +5844,13 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( return Cost; } + // f16 -> f32 is natively supported for fdot + if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) { + if (AccumLT.second.getScalarType() == MVT::f32 && + InputLT.second.getScalarType() == MVT::f16) + return Cost; + } + // Add additional cost for the extends that would need to be inserted. return Cost + 2; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b7224a33f47b1..4d542b29dd2b2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8008,7 +8008,8 @@ bool VPRecipeBuilder::getScaledReductions( continue; } Value *ExtOp; - if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) + if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))) && + !match(OpI, m_FPExt(m_Value(ExtOp)))) return false; Exts[I] = cast(OpI); @@ -8176,6 +8177,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "all accumulators in chain must have same scale factor"); unsigned ReductionOpcode = Reduction->getOpcode(); + if (ReductionOpcode == Instruction::FAdd && !Reduction->hasAllowReassoc()) + return nullptr; if (ReductionOpcode == Instruction::Sub) { auto *const Zero = ConstantInt::get(Reduction->getType(), 0); SmallVector Ops; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b57c44872c1b6..b67a2461018dd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -498,6 +498,12 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { return m_c_Binary(Op0, Op1); 
} +template +inline AllRecipe_match +m_FMul(const Op0_t &Op0, const Op1_t &Op1) { + return m_Binary(Op0, Op1); +} + /// Match a binary AND operation. template inline AllRecipe_commutative_match diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 80cd112dbcd8a..77baf03503427 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -317,7 +317,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, // FIXME: Replace the entire function with this once all partial reduction // variants are bundled into VPExpressionRecipe. if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) && - !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { + !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst))) && + !match(Op, m_FMul(m_VPValue(), m_VPValue()))) { auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); auto *InputType = Ctx.Types.inferScalarType(getVecOp()); return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType, @@ -340,6 +341,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, return TTI::PR_ZeroExtend; if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) return TTI::PR_SignExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::FPExt) + return TTI::PR_FPExtend; return TTI::PR_None; }; @@ -392,8 +395,9 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, void VPPartialReductionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); + assert( + (getOpcode() == Instruction::Add || getOpcode() == Instruction::FAdd) && + "Unhandled partial reduction opcode"); Value *BinOpVal = State.get(getOperand(1)); Value *PhiVal = State.get(getOperand(0)); @@ -401,9 +405,20 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { Type *RetTy = PhiVal->getType(); - CallInst *V = - 
Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + enum llvm::Intrinsic::IndependentIntrinsics PRIntrinsic; + switch (getOpcode()) { + case Instruction::Add: { + PRIntrinsic = Intrinsic::vector_partial_reduce_add; + break; + } + case Instruction::FAdd: { + PRIntrinsic = Intrinsic::vector_partial_reduce_fadd; + break; + } + } + + CallInst *V = Builder.CreateIntrinsic(RetTy, PRIntrinsic, {PhiVal, BinOpVal}, + nullptr, "partial.reduce"); State.set(this, V); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll new file mode 100644 index 0000000000000..33c6fc89465e6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll @@ -0,0 +1,1063 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define float @fdotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @fdotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label 
%[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret float [[TMP7]] +; +; CHECK-INTERLEAVED-LABEL: define float @fdotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: 
[[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr half, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x half>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x half>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD3]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x half> [[WIDE_LOAD4]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI1]], <8 x float> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret float [[TMP11]] +; +; CHECK-MAXBW-LABEL: define float @fdotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP6]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP0]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr half, ptr 
[[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( [[VEC_PHI]], [[TMP8]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: 
[[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define double @not_fdotp_different_types(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr 
[[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x float> [[WIDE_LOAD1]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = fadd reassoc <8 x double> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: 
[[TMP1:%.*]] = getelementptr half, ptr [[GEP_A]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[GEP_B]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP5]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x float> [[WIDE_LOAD3]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x float> [[WIDE_LOAD4]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x double> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x double> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = fadd reassoc <8 x double> [[TMP8]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = fadd reassoc <8 x double> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x double> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define 
double @not_fdotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (double -0.000000e+00), double 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9]] = fadd reassoc [[TMP8]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call reassoc double 
@llvm.vector.reduce.fadd.nxv8f64(double -0.000000e+00, [[TMP9]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to double +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load float, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext float [[LOAD_B]] to double +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul double [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc double [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret double [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi double [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = 
getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to double + %gep.b = getelementptr float, ptr %b, i64 %iv + %load.b = load float, ptr %gep.b, align 2 + %ext.b = fpext float %load.b to double + %mul = fmul double %ext.b, %ext.a + %add = fadd reassoc double %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret double %add +} + +define float @not_fdotp_not_phi(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr 
[[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 3 +; CHECK-INTERLEAVED-NEXT: 
[[TMP7:%.*]] = getelementptr half, ptr [[TMP4]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = fmul [[TMP13]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = fadd reassoc [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP15]], i32 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: 
[[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; 
CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; 
CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define void @not_fdotp_not_phi2(ptr %matrix, i32 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVE1-SAME: ptr 
[[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = insertelement <4 
x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-INTERLEAVE1-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], 
[[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; 
CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVE1-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVE1-NEXT: ret void +; +; CHECK-INTERLEAVED-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x 
float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP71:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 128 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 160 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 192 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 224 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP7:%.*]] = 
getelementptr i8, ptr [[MATRIX]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: 
[[TMP31:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = insertelement <4 x half> poison, half [[TMP28]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <4 x half> [[TMP34]], half [[TMP31]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = load half, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = load half, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = load half, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load half, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <4 x half> poison, half [[TMP36]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = insertelement <4 x half> [[TMP40]], half [[TMP37]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = insertelement <4 x half> [[TMP41]], half [[TMP38]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = insertelement <4 x half> [[TMP42]], half [[TMP39]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = fpext <4 x half> [[TMP35]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = fpext <4 x half> [[TMP43]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP45]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = fadd reassoc <4 x float> [[TMP46]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = fadd reassoc <4 x float> [[TMP47]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = load half, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = load half, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = load half, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = load half, ptr [[TMP23]], 
align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = insertelement <4 x half> poison, half [[TMP50]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <4 x half> [[TMP54]], half [[TMP51]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = insertelement <4 x half> [[TMP55]], half [[TMP52]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = insertelement <4 x half> [[TMP56]], half [[TMP53]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = load half, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load half, ptr [[TMP25]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = load half, ptr [[TMP26]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = load half, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = insertelement <4 x half> poison, half [[TMP58]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = insertelement <4 x half> [[TMP62]], half [[TMP59]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = insertelement <4 x half> [[TMP63]], half [[TMP60]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <4 x half> [[TMP64]], half [[TMP61]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = fpext <4 x half> [[TMP57]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = fpext <4 x half> [[TMP65]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP66]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP67]] +; CHECK-INTERLEAVED-NEXT: [[TMP70]] = fadd reassoc <4 x float> [[TMP68]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[TMP71]] = fadd reassoc <4 x float> [[TMP69]], [[TMP49]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] 
= fadd reassoc <4 x float> [[TMP71]], [[TMP70]] +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL11]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], 
[[ADD]] +; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVED-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVED-NEXT: ret void +; +; CHECK-MAXBW-LABEL: define void @not_fdotp_not_phi2( +; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-MAXBW: [[FOR_PREHEADER]]: +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = 
getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr half, 
ptr [[NEXT_GEP3]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-MAXBW-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], 
[[TMP26]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; 
CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-MAXBW-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-MAXBW-NEXT: ret void +; +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.preheader, label %exit + +for.preheader: ; preds = %entry + %load.a = load half, ptr inttoptr (i64 0 to ptr), align 1 + %load.a1 = load half, ptr inttoptr (i64 1 to ptr), align 1 + %a.ext = fpext half %load.a to float + %a.ext1 = fpext half %load.a1 to float + br label %for.body + +for.body: ; preds = %for.preheader, %for.body + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ] + %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ] + %accum = phi float [ %add.1, %for.body ], [ 0.0, %for.preheader ] + %gep.b = getelementptr half, ptr %ptr, i64 1 + %gep.b1 = getelementptr half, ptr %ptr, i64 2 + %load.b = load half, ptr %gep.b, align 1 + %b.ext = fpext half %load.b to float + %mul = fmul float %a.ext, %b.ext + %add = fadd reassoc float %mul, %accum + %load.b1 = load half, ptr %gep.b1, align 1 + %b.ext1 = fpext half %load.b1 to float + %mul.1 = fmul float %a.ext1, %b.ext1 + %add.1 = fadd reassoc float %mul.1, %add + %scevgep = getelementptr half, ptr %ptr, i64 16 + %iv.next = add nuw nsw i32 %iv, 1 + 
%exitcond.not = icmp eq i32 %iv.next, %n + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %add.float = phi float [ %add.1, %for.body ] + br label %exit + +exit: ; preds = %for.exit, %entry + %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ] + store float %result, ptr %matrix, align 4 + ret void +} + +define float @not_fdotp_ext_outside_plan(ptr %a, half %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVE1: [[FOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr
[[TMP0]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = fadd reassoc <8 x float> [[TMP2]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT:
[[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVED: [[FOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT:
[[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP2]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = fadd reassoc <8 x float> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = fadd reassoc <8 x float> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x float> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +;
CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[RESULT]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-MAXBW: [[FOR_PH]]: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +;
CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x float> [[BROADCAST_SPLATINSERT]], <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext <vscale x 8 x half> [[WIDE_LOAD]] to <vscale x 8 x float> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = fmul <vscale x 8 x float> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP7]] = fadd reassoc <vscale x 8 x float> [[TMP6]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [
0, %[[FOR_PH]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: [[EXIT_LOOPEXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-MAXBW-NEXT: ret float [[RESULT]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = fpext half %b to float + br label %for.body + +for.body: ; preds = %for.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 2 + %ext.a = fpext half %load.a to float + %mul = fmul float %ext.a, %ext.b + %add = fadd reassoc float %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 +
%cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + %result = phi float [ 0.0, %entry ], [ %add, %for.body ] + ret float %result +}