@@ -11704,62 +11704,69 @@ static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
1170411704 return RotateAmt;
1170511705}
1170611706
11707+ static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11708+ const X86Subtarget &Subtarget,
11709+ ArrayRef<int> Mask) {
11710+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11711+ assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11712+
11713+ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11714+ int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11715+ int MaxSubElts = 64 / EltSizeInBits;
11716+ for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
11717+ int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
11718+ if (RotateAmt < 0)
11719+ continue;
11720+
11721+ int NumElts = Mask.size();
11722+ MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11723+ RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11724+ return RotateAmt * EltSizeInBits;
11725+ }
11726+
11727+ return -1;
11728+ }
11729+
1170711730/// Lower shuffle using X86ISD::VROTLI rotations.
1170811731static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
1170911732 ArrayRef<int> Mask,
1171011733 const X86Subtarget &Subtarget,
1171111734 SelectionDAG &DAG) {
11712- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11713-
11714- MVT SVT = VT.getScalarType();
11715- int EltSizeInBits = SVT.getScalarSizeInBits();
11716- assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11717-
1171811735 // Only XOP + AVX512 targets have bit rotation instructions.
1171911736 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
1172011737 bool IsLegal =
1172111738 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
1172211739 if (!IsLegal && Subtarget.hasSSE3())
1172311740 return SDValue();
1172411741
11725- // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11726- int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11727- int MaxSubElts = 64 / EltSizeInBits;
11728- for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
11729- int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
11730- if (RotateAmt < 0)
11731- continue;
11742+ MVT RotateVT;
11743+ int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11744+ Subtarget, Mask);
11745+ if (RotateAmt < 0)
11746+ return SDValue();
1173211747
11733- int NumElts = VT.getVectorNumElements();
11734- MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11735- MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11736-
11737- // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11738- // expanded to OR(SRL,SHL), will be more efficient, but if they can
11739- // widen to vXi16 or more then existing lowering should will be better.
11740- int RotateAmtInBits = RotateAmt * EltSizeInBits;
11741- if (!IsLegal) {
11742- if ((RotateAmtInBits % 16) == 0)
11743- return SDValue();
11744- // TODO: Use getTargetVShiftByConstNode.
11745- unsigned ShlAmt = RotateAmtInBits;
11746- unsigned SrlAmt = RotateSVT.getScalarSizeInBits() - RotateAmtInBits;
11747- V1 = DAG.getBitcast(RotateVT, V1);
11748- SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11749- DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11750- SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11751- DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11752- SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11753- return DAG.getBitcast(VT, Rot);
11754- }
11755-
11756- SDValue Rot =
11757- DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11758- DAG.getTargetConstant(RotateAmtInBits, DL, MVT::i8));
11748+ // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11749+ // expanded to OR(SRL,SHL), will be more efficient, but if they can
11750+ // widen to vXi16 or more then existing lowering should will be better.
11751+ if (!IsLegal) {
11752+ if ((RotateAmt % 16) == 0)
11753+ return SDValue();
11754+ // TODO: Use getTargetVShiftByConstNode.
11755+ unsigned ShlAmt = RotateAmt;
11756+ unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11757+ V1 = DAG.getBitcast(RotateVT, V1);
11758+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11759+ DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11760+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11761+ DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11762+ SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
1175911763 return DAG.getBitcast(VT, Rot);
1176011764 }
1176111765
11762- return SDValue();
11766+ SDValue Rot =
11767+ DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11768+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11769+ return DAG.getBitcast(VT, Rot);
1176311770}
1176411771
1176511772/// Try to lower a vector shuffle as a byte rotation.
@@ -33538,6 +33545,19 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
3353833545 }
3353933546 }
3354033547
33548+ // Attempt to match against bit rotates.
33549+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
33550+ ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
33551+ Subtarget.hasAVX512())) {
33552+ int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
33553+ Subtarget, Mask);
33554+ if (0 < RotateAmt) {
33555+ Shuffle = X86ISD::VROTLI;
33556+ PermuteImm = (unsigned)RotateAmt;
33557+ return true;
33558+ }
33559+ }
33560+
3354133561 return false;
3354233562}
3354333563
0 commit comments