
Commit 34a054c

[X86] combineX86ShuffleChain - add support for combining to X86ISD::ROTLI
Refactors matchShuffleAsBitRotate to allow use by both lowerShuffleAsBitRotate and matchUnaryPermuteShuffle.
1 parent aa5ebfd commit 34a054c
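The change hinges on recognizing a shuffle mask as a per-element rotation within fixed-size groups. As a rough illustration of that matching idea (a simplified, hypothetical stand-in, not the LLVM helper of the same name), consider:

    #include <cstdio>
    #include <vector>

    // Hypothetical, simplified per-group matcher: returns the element rotation
    // amount if every NumSubElts-wide group of Mask is the same rotation of the
    // identity, or -1 otherwise.
    static int matchMaskAsElementRotate(const std::vector<int> &Mask, int NumSubElts) {
      int NumElts = (int)Mask.size();
      for (int RotateAmt = 1; RotateAmt < NumSubElts; ++RotateAmt) {
        bool Match = true;
        for (int i = 0; i != NumElts && Match; ++i) {
          int Base = (i / NumSubElts) * NumSubElts;
          int Expected = Base + ((i % NumSubElts) + RotateAmt) % NumSubElts;
          Match = (Mask[i] == Expected);
        }
        if (Match)
          return RotateAmt;
      }
      return -1;
    }

    int main() {
      // v16i8 mask that swaps the bytes of every i16 element.
      std::vector<int> Mask = {1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14};
      int EltSizeInBits = 8;
      int RotateAmt = matchMaskAsElementRotate(Mask, /*NumSubElts=*/2);
      std::printf("rotate by %d bits\n", RotateAmt * EltSizeInBits); // prints "rotate by 8 bits"
      return 0;
    }

Whether a group rotation of this form corresponds to a left or right bit rotate depends on element-order conventions; the diff below handles that inside matchShuffleAsBitRotate and scales the result to bits (RotateAmt * EltSizeInBits).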

4 files changed: 89 additions, 49 deletions


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 60 additions & 40 deletions
@@ -11704,62 +11704,69 @@ static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
   return RotateAmt;
 }
 
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+                                   const X86Subtarget &Subtarget,
+                                   ArrayRef<int> Mask) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
+
+  // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
+  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
+  int MaxSubElts = 64 / EltSizeInBits;
+  for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+    int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+    if (RotateAmt < 0)
+      continue;
+
+    int NumElts = Mask.size();
+    MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+    RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+    return RotateAmt * EltSizeInBits;
+  }
+
+  return -1;
+}
+
 /// Lower shuffle using X86ISD::VROTLI rotations.
 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                        ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
-  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
-
-  MVT SVT = VT.getScalarType();
-  int EltSizeInBits = SVT.getScalarSizeInBits();
-  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
-
   // Only XOP + AVX512 targets have bit rotation instructions.
   // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
   bool IsLegal =
       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
   if (!IsLegal && Subtarget.hasSSE3())
     return SDValue();
 
-  // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
-  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
-  int MaxSubElts = 64 / EltSizeInBits;
-  for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
-    int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
-    if (RotateAmt < 0)
-      continue;
+  MVT RotateVT;
+  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+                                          Subtarget, Mask);
+  if (RotateAmt < 0)
+    return SDValue();
 
-    int NumElts = VT.getVectorNumElements();
-    MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
-    MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
-
-    // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
-    // expanded to OR(SRL,SHL), will be more efficient, but if they can
-    // widen to vXi16 or more then existing lowering should will be better.
-    int RotateAmtInBits = RotateAmt * EltSizeInBits;
-    if (!IsLegal) {
-      if ((RotateAmtInBits % 16) == 0)
-        return SDValue();
-      // TODO: Use getTargetVShiftByConstNode.
-      unsigned ShlAmt = RotateAmtInBits;
-      unsigned SrlAmt = RotateSVT.getScalarSizeInBits() - RotateAmtInBits;
-      V1 = DAG.getBitcast(RotateVT, V1);
-      SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
-                                DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
-      SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
-                                DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
-      SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
-      return DAG.getBitcast(VT, Rot);
-    }
-
-    SDValue Rot =
-        DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
-                    DAG.getTargetConstant(RotateAmtInBits, DL, MVT::i8));
+  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+  // expanded to OR(SRL,SHL), will be more efficient, but if they can
+  // widen to vXi16 or more then existing lowering should will be better.
+  if (!IsLegal) {
+    if ((RotateAmt % 16) == 0)
+      return SDValue();
+    // TODO: Use getTargetVShiftByConstNode.
+    unsigned ShlAmt = RotateAmt;
+    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+    V1 = DAG.getBitcast(RotateVT, V1);
+    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
     return DAG.getBitcast(VT, Rot);
   }
 
-  return SDValue();
+  SDValue Rot =
+      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+  return DAG.getBitcast(VT, Rot);
 }
 
 /// Try to lower a vector shuffle as a byte rotation.
@@ -33538,6 +33545,19 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Attempt to match against bit rotates.
+  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+      ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+       Subtarget.hasAVX512())) {
+    int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+                                            Subtarget, Mask);
+    if (0 < RotateAmt) {
+      Shuffle = X86ISD::VROTLI;
+      PermuteImm = (unsigned)RotateAmt;
+      return true;
+    }
+  }
+
   return false;
 }
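To make the sub-group sizing in the new matchShuffleAsBitRotate wrapper concrete: AVX512 only provides vXi32/vXi64 rotates, so for vXi8 shuffles the smallest group tried is four bytes, while 128-bit XOP targets can start at two. A standalone restatement of that arithmetic (illustrative only, using plain ints instead of MVTs):

    #include <algorithm>
    #include <cstdio>

    int main() {
      int EltSizeInBits = 8;   // shuffling vXi8 elements
      bool HasAVX512 = true;   // flip to false for the 128-bit XOP case
      int MinSubElts = HasAVX512 ? std::max(32 / EltSizeInBits, 2) : 2;
      int MaxSubElts = 64 / EltSizeInBits;
      for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2)
        std::printf("try i%d group rotates\n", NumSubElts * EltSizeInBits);
      // With AVX512: prints i32 then i64. With HasAVX512 = false (the XOP path),
      // i16 groups are also tried, which is how the vprotw $8 in the XOP test
      // below can be formed.
      return 0;
    }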

llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

Lines changed: 11 additions & 4 deletions
@@ -464,10 +464,17 @@ define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
 }
 
 define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
-; CHECK-LABEL: combine_pshufb_not_as_pshufw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
-; CHECK-NEXT:    ret{{[l|q]}}
+; AVX2-LABEL: combine_pshufb_not_as_pshufw:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
+; AVX2-NEXT:    ret{{[l|q]}}
+;
+; AVX512-LABEL: combine_pshufb_not_as_pshufw:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vprold $16, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    ret{{[l|q]}}
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
   %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
   ret <32 x i8> %res1
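For the AVX512 check lines above: composing the two pshufb masks from this test swaps the two 16-bit halves of every 32-bit element, which is exactly a 32-bit rotate by 16, hence the single vprold $16 (widened to zmm because the ymm form needs AVX512VL). A small illustration of the mask composition over one 128-bit lane (illustrative only):

    #include <cstdio>

    int main() {
      // Byte-shuffle masks from %res0 and %res1 above (one 128-bit lane each).
      int First[16]  = {2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15};
      int Second[16] = {0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13};
      // res1[i] = res0[Second[i]] = a0[First[Second[i]]], so the combined mask is:
      for (int i = 0; i != 16; ++i)
        std::printf("%d ", First[Second[i]]);
      std::printf("\n"); // prints 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13
      // Every aligned group of four bytes has its 16-bit halves swapped,
      // i.e. each i32 element is rotated by 16 bits.
      return 0;
    }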

llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll

Lines changed: 17 additions & 4 deletions
@@ -403,10 +403,23 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
 ; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_pshufb_not_as_pshufw:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_pshufb_not_as_pshufw:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pshufb_not_as_pshufw:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: combine_pshufb_not_as_pshufw:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vprold $16, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
   %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
   ret <16 x i8> %res1

llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
 define <16 x i8> @combine_vpperm_as_proti_v8i16(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: combine_vpperm_as_proti_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vprotw $8, %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14>)
   ret <16 x i8> %res0
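The vpperm mask here swaps the two bytes of every i16 lane, and for a 16-bit value a byte swap is the same as a rotate by 8 bits, which is why the combine can now emit a single vprotw $8. A scalar sanity check of that equivalence (illustrative only):

    #include <cstdint>
    #include <cstdio>

    static uint16_t rotl16(uint16_t V, unsigned N) {
      return (uint16_t)((V << N) | (V >> (16 - N)));
    }
    static uint16_t byteswap16(uint16_t V) {
      return (uint16_t)((V >> 8) | (V << 8));
    }

    int main() {
      for (uint32_t X = 0; X <= 0xFFFF; ++X)
        if (byteswap16((uint16_t)X) != rotl16((uint16_t)X, 8))
          return 1; // never hit: the two agree for every 16-bit value
      std::printf("byte swap == 16-bit rotate by 8 for all 65536 values\n");
      return 0;
    }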
