Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,12 @@ class TargetTransformInfoImplBase;
/// for IR-level transformations.
class TargetTransformInfo {
public:
enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
/// Classifies the extend (if any) feeding an operand of a partial reduction.
enum PartialReductionExtendKind {
// The instruction is not one of the recognised extends.
PR_None,
// Integer sign extension (Instruction::CastOps::SExt).
PR_SignExtend,
// Integer zero extension (presumably Instruction::CastOps::ZExt — the
// matching case label is outside the visible hunk).
PR_ZeroExtend,
// Floating-point extension (Instruction::CastOps::FPExt).
PR_FPExtend
};

/// Get the kind of extension that an instruction represents.
LLVM_ABI static PartialReductionExtendKind
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,8 @@ TargetTransformInfo::getPartialReductionExtendKind(
return PR_ZeroExtend;
case Instruction::CastOps::SExt:
return PR_SignExtend;
case Instruction::CastOps::FPExt:
return PR_FPExtend;
default:
return PR_None;
}
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5761,7 +5761,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
(!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;

if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
Opcode != Instruction::FAdd) ||
OpAExtend == TTI::PR_None)
return Invalid;

Expand All @@ -5771,7 +5772,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(

// We only support multiply binary operations for now, and for muls we
// require the types being extended to be the same.
if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
InputTypeA != InputTypeB))
return Invalid;

bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
Expand Down Expand Up @@ -5842,6 +5844,13 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost;
}

// f16 -> f32 is natively supported for fdot
if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) {
if (AccumLT.second.getScalarType() == MVT::f32 &&
InputLT.second.getScalarType() == MVT::f16)
return Cost;
}

// Add additional cost for the extends that would need to be inserted.
return Cost + 2;
}
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8008,7 +8008,8 @@ bool VPRecipeBuilder::getScaledReductions(
continue;
}
Value *ExtOp;
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))) &&
!match(OpI, m_FPExt(m_Value(ExtOp))))
return false;
Exts[I] = cast<Instruction>(OpI);

Expand Down Expand Up @@ -8176,6 +8177,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"all accumulators in chain must have same scale factor");

unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::FAdd && !Reduction->hasAllowReassoc())
return nullptr;
if (ReductionOpcode == Instruction::Sub) {
auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
SmallVector<VPValue *, 2> Ops;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,12 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
}

/// Match a binary FMul operation.
/// NOTE(review): this is built on m_Binary rather than m_c_Binary, so the
/// operands are presumably matched only in the order given — confirm whether a
/// commutative variant is wanted for callers that use asymmetric sub-patterns.
template <typename Op0_t, typename Op1_t>
inline AllRecipe_match<Instruction::FMul, Op0_t, Op1_t>
m_FMul(const Op0_t &Op0, const Op1_t &Op1) {
return m_Binary<Instruction::FMul, Op0_t, Op1_t>(Op0, Op1);
}

/// Match a binary AND operation.
template <typename Op0_t, typename Op1_t>
inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t>
Expand Down
27 changes: 21 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
// FIXME: Replace the entire function with this once all partial reduction
// variants are bundled into VPExpressionRecipe.
if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) &&
!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst))) &&
!match(Op, m_FMul(m_VPValue(), m_VPValue()))) {
auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
auto *InputType = Ctx.Types.inferScalarType(getVecOp());
return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
Expand All @@ -340,6 +341,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
return TTI::PR_ZeroExtend;
if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
return TTI::PR_SignExtend;
if (WidenCastR->getOpcode() == Instruction::CastOps::FPExt)
return TTI::PR_FPExtend;
return TTI::PR_None;
};

Expand Down Expand Up @@ -392,18 +395,30 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
/// Emit the partial-reduction intrinsic call for this recipe.
///
/// Operand 0 supplies the accumulator (phi) value and operand 1 the value that
/// is reduced into it. The call's result type is the accumulator's type, and
/// the generated value is recorded in \p State for this recipe.
///
/// Only Add and FAdd reductions are supported; Add selects
/// vector_partial_reduce_add and FAdd selects vector_partial_reduce_fadd.
void VPPartialReductionRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  const bool IsFAdd = getOpcode() == Instruction::FAdd;
  assert((IsFAdd || getOpcode() == Instruction::Add) &&
         "Unhandled partial reduction opcode");

  Value *BinOpVal = State.get(getOperand(1));
  Value *PhiVal = State.get(getOperand(0));
  assert(PhiVal && BinOpVal && "Phi and Mul must be set");

  Type *RetTy = PhiVal->getType();

  // Select the intrinsic with a single initialising expression so the ID is
  // never left indeterminate, even in builds where the assert above is
  // compiled out.
  const Intrinsic::ID PRIntrinsic = IsFAdd
                                        ? Intrinsic::vector_partial_reduce_fadd
                                        : Intrinsic::vector_partial_reduce_add;

  CallInst *V = Builder.CreateIntrinsic(RetTy, PRIntrinsic, {PhiVal, BinOpVal},
                                        nullptr, "partial.reduce");

  State.set(this, V);
}
Expand Down
Loading