Skip to content

Commit 41d845f

Browse files
committed
Address review comments
1 parent 596edec commit 41d845f

File tree

3 files changed

+102
-72
lines changed

3 files changed

+102
-72
lines changed

llvm/docs/LangRef.rst

Lines changed: 70 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -20315,6 +20315,76 @@ Arguments:
2031520315
""""""""""
2031620316
The argument to this intrinsic must be a vector of floating-point values.
2031720317

20318+
Vector Partial Reduction Intrinsics
20319+
-----------------------------------
20320+
20321+
Partial reductions of vectors can be expressed using the following intrinsics.
20322+
Each one reduces the concatenation of the two vector arguments down to the
20323+
number of elements of the result vector type.
20324+
20325+
Other than the reduction operator (e.g. add, fadd) the way in which the
20326+
concatenated arguments are reduced is entirely unspecified. By their nature these
20327+
intrinsics are not expected to be useful in isolation but instead implement the
20328+
first phase of an overall reduction operation.
20329+
20330+
The typical use case is loop vectorization where reductions are split into an
20331+
in-loop phase, where maintaining an unordered vector result is important for
20332+
performance, and an out-of-loop phase to calculate the final scalar result.
20333+
20334+
By avoiding the introduction of new ordering constraints, these intrinsics
20335+
enhance the ability to leverage a target's accumulation instructions.
20336+
20337+
'``llvm.vector.partial.reduce.add.*``' Intrinsic
20338+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20339+
20340+
Syntax:
20341+
"""""""
20342+
This is an overloaded intrinsic.
20343+
20344+
::
20345+
20346+
declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
20347+
declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
20348+
declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
20349+
declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
20350+
20351+
Arguments:
20352+
""""""""""
20353+
20354+
The first argument is an integer vector with the same type as the result.
20355+
20356+
The second argument is a vector with a length that is a known integer multiple
20357+
of the result's length, while maintaining the same element type.
20358+
20359+
'``llvm.vector.partial.reduce.fadd.*``' Intrinsic
20360+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20361+
20362+
Syntax:
20363+
"""""""
20364+
This is an overloaded intrinsic.
20365+
20366+
::
20367+
20368+
declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b)
20369+
declare <vscale x 4 x float> @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b)
20370+
20371+
Arguments:
20372+
""""""""""
20373+
20374+
The first argument is a floating-point vector with the same type as the result.
20375+
20376+
The second argument is a vector with a length that is a known integer multiple
20377+
of the result's length, while maintaining the same element type.
20378+
20379+
Semantics:
20380+
""""""""""
20381+
20382+
As the way in which the arguments to this floating-point intrinsic are reduced
20383+
is unspecified, this intrinsic will assume floating-point reassociation and
20384+
contraction, which may result in variations to the results due to reordering or
20385+
by lowering to different instructions (including combining multiple instructions
20386+
into a single one).
20387+
2031820388
'``llvm.vector.insert``' Intrinsic
2031920389
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2032020390

@@ -20688,74 +20758,6 @@ Note that it has the following implications:
2068820758
- If ``%cnt`` is non-zero, the return value is non-zero as well.
2068920759
- If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``.
2069020760

20691-
Vector Partial Reduction Intrinsics
20692-
-----------------------------------
20693-
20694-
Partial horizontal reductions of vectors can be expressed using the following intrinsics.
20695-
Each one reduces the concatenation of the two vector arguments down to the number of elements
20696-
of the result vector type.
20697-
20698-
Other than the reduction operator (e.g. add, fadd) the way in which the concatenated
20699-
arguments are reduced is entirely unspecified. By their nature these intrinsics
20700-
are not expected to be useful in isolation but instead implement the first phase
20701-
of an overall reduction operation.
20702-
20703-
The typical use case is loop vectorization where reductions are split into an
20704-
in-loop phase, where maintaining an unordered vector result is important for
20705-
performance, and an out-of-loop phase to calculate the final scalar result.
20706-
20707-
By avoiding the introduction of new ordering constraints, these intrinsics
20708-
enhance the ability to leverage a target's accumulation instructions.
20709-
20710-
'``llvm.vector.partial.reduce.add.*``' Intrinsic
20711-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20712-
20713-
Syntax:
20714-
"""""""
20715-
This is an overloaded intrinsic.
20716-
20717-
::
20718-
20719-
declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
20720-
declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
20721-
declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
20722-
declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
20723-
20724-
Arguments:
20725-
""""""""""
20726-
20727-
The first argument is an integer vector with the same type as the result.
20728-
20729-
The second argument is a vector with a length that is a known integer multiple
20730-
of the result's length, while maintaining the same element type.
20731-
20732-
'``llvm.vector.partial.reduce.fadd.*``' Intrinsic
20733-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20734-
20735-
Syntax:
20736-
"""""""
20737-
This is an overloaded intrinsic.
20738-
20739-
::
20740-
20741-
declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b)
20742-
declare <vscale x 4 x float> @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b)
20743-
20744-
Arguments:
20745-
""""""""""
20746-
20747-
The first argument is a floating-point vector with the same type as the result.
20748-
20749-
The second argument is a vector with a length that is a known integer multiple
20750-
of the result's type, while maintaining the same element type.
20751-
20752-
Semantics:
20753-
""""""""""
20754-
20755-
As the way in which the arguments to this floating-point intrinsic are reduced is unspecified,
20756-
this intrinsic will reassociate floating-point values, which may result in variations to the
20757-
results due to reordering or by lowering to different instructions.
20758-
2075920761
'``llvm.experimental.vector.histogram.*``' Intrinsic
2076020762
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2076120763

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12990,6 +12990,12 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
1299012990
//
1299112991
// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1))
1299212992
// -> partial_reduce_*mla(acc, x, C)
12993+
//
12994+
// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0))
12995+
// -> partial_reduce_fmla(acc, a, b)
12996+
//
12997+
// partial_reduce_fmla(acc, fmul(fpext(x), splat(C)), splat(1.0))
12998+
// -> partial_reduce_fmla(acc, x, C)
1299312999
SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1299413000
SDLoc DL(N);
1299513001
auto *Context = DAG.getContext();
@@ -13006,10 +13012,14 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1300613012
CFP->isExactlyValue(1.0)))
1300713013
return SDValue();
1300813014

13015+
auto IsIntOrFPExtOpcode = [](unsigned int Opcode) {
13016+
return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND);
13017+
};
13018+
1300913019
SDValue LHS = Op1->getOperand(0);
1301013020
SDValue RHS = Op1->getOperand(1);
1301113021
unsigned LHSOpcode = LHS->getOpcode();
13012-
if (!ISD::isExtOpcode(LHSOpcode) && LHSOpcode != ISD::FP_EXTEND)
13022+
if (!IsIntOrFPExtOpcode(LHSOpcode))
1301313023
return SDValue();
1301413024

1301513025
SDValue LHSExtOp = LHS->getOperand(0);
@@ -13041,7 +13051,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1304113051
}
1304213052

1304313053
unsigned RHSOpcode = RHS->getOpcode();
13044-
if (!ISD::isExtOpcode(RHSOpcode) && RHSOpcode != ISD::FP_EXTEND)
13054+
if (!IsIntOrFPExtOpcode(RHSOpcode))
1304513055
return SDValue();
1304613056

1304713057
SDValue RHSExtOp = RHS->getOperand(0);
@@ -13087,6 +13097,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1308713097
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
1308813098
// partial.reduce.sumla(acc, sext(op), splat(1))
1308913099
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
13100+
// partial.reduce.fmla(acc, fpext(op), splat(1.0))
13101+
// -> partial.reduce.fmla(acc, op, splat(1.0))
1309013102
SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1309113103
SDLoc DL(N);
1309213104
SDValue Acc = N->getOperand(0);
@@ -13103,7 +13115,7 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1310313115
return SDValue();
1310413116

1310513117
unsigned Op1Opcode = Op1.getOpcode();
13106-
if (!ISD::isExtOpcode(Op1Opcode))
13118+
if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND)
1310713119
return SDValue();
1310813120

1310913121
bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND;
@@ -13127,8 +13139,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1312713139
TLI.getTypeToTransformTo(*Context, UnextOp1VT)))
1312813140
return SDValue();
1312913141

13142+
auto Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA
13143+
? DAG.getConstantFP(1, DL, UnextOp1VT)
13144+
: DAG.getConstant(1, DL, UnextOp1VT);
13145+
1313013146
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1,
13131-
DAG.getConstant(1, DL, UnextOp1VT));
13147+
Constant);
1313213148
}
1313313149

1313413150
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {

llvm/test/CodeGen/AArch64/sve2p1-fdot.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,18 @@ entry:
1414
ret <vscale x 4 x float> %partial.reduce
1515
}
1616

17+
define <vscale x 4 x float> @fdot_splat_vl128(<vscale x 4 x float> %acc, <vscale x 8 x half> %a) {
18+
; CHECK-LABEL: fdot_splat_vl128:
19+
; CHECK: // %bb.0: // %entry
20+
; CHECK-NEXT: fmov z2.h, #1.00000000
21+
; CHECK-NEXT: fdot z0.s, z1.h, z2.h
22+
; CHECK-NEXT: ret
23+
entry:
24+
%a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
25+
%partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a.wide)
26+
ret <vscale x 4 x float> %partial.reduce
27+
}
28+
1729
define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) {
1830
; CHECK-LABEL: fdot_wide_vl256:
1931
; CHECK: // %bb.0: // %entry

0 commit comments

Comments
 (0)