@@ -6119,28 +6119,38 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61196119 EVT VT = N->getValueType(0);
61206120 unsigned ValSize = VT.getSizeInBits();
61216121 unsigned IID = N->getConstantOperandVal(0);
6122+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6123+ IID == Intrinsic::amdgcn_permlanex16;
61226124 SDLoc SL(N);
61236125 MVT IntVT = MVT::getIntegerVT(ValSize);
61246126
61256127 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
61266128 SDValue Src2, MVT ValT) -> SDValue {
61276129 SmallVector<SDValue, 8> Operands;
6128- Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
61296130 switch (IID) {
6130- case Intrinsic::amdgcn_readfirstlane:
6131- Operands.push_back(Src0);
6132- break;
6131+ case Intrinsic::amdgcn_permlane16:
6132+ case Intrinsic::amdgcn_permlanex16:
6133+ Operands.push_back(N->getOperand(6));
6134+ Operands.push_back(N->getOperand(5));
6135+ Operands.push_back(N->getOperand(4));
6136+ [[fallthrough]];
6137+ case Intrinsic::amdgcn_writelane:
6138+ Operands.push_back(Src2);
6139+ [[fallthrough]];
61336140 case Intrinsic::amdgcn_readlane:
6134- Operands.push_back(Src0);
61356141 Operands.push_back(Src1);
6136- break;
6137- case Intrinsic::amdgcn_writelane:
6142+ [[fallthrough]];
6143+ case Intrinsic::amdgcn_readfirstlane:
6144+ case Intrinsic::amdgcn_permlane64:
61386145 Operands.push_back(Src0);
6139- Operands.push_back(Src1);
6140- Operands.push_back(Src2);
61416146 break;
6147+ default:
6148+ llvm_unreachable("unhandled lane op");
61426149 }
61436150
6151+ Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6152+ std::reverse(Operands.begin(), Operands.end());
6153+
61446154 if (SDNode *GL = N->getGluedNode()) {
61456155 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
61466156 GL = GL->getOperand(0).getNode();
@@ -6153,9 +6163,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61536163
61546164 SDValue Src0 = N->getOperand(1);
61556165 SDValue Src1, Src2;
6156- if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
6166+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6167+ IsPermLane16) {
61576168 Src1 = N->getOperand(2);
6158- if (IID == Intrinsic::amdgcn_writelane)
6169+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 )
61596170 Src2 = N->getOperand(3);
61606171 }
61616172
@@ -6168,10 +6179,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61686179 bool IsFloat = VT.isFloatingPoint();
61696180 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
61706181 SL, MVT::i32);
6171- if (Src2.getNode()) {
6182+
6183+ if (IsPermLane16) {
6184+ Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6185+ SL, MVT::i32);
6186+ }
6187+
6188+ if (IID == Intrinsic::amdgcn_writelane) {
61726189 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
61736190 SL, MVT::i32);
61746191 }
6192+
61756193 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
61766194 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
61776195 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
@@ -6233,17 +6251,23 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62336251 case MVT::bf16: {
62346252 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
62356253 SmallVector<SDValue, 4> Pieces;
6254+ SDValue Src0SubVec, Src1SubVec, Src2SubVec;
62366255 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6237- SDValue Src0SubVec =
6238- DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6239- DAG.getConstant(EltIdx, SL, MVT::i32));
6256+ Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6257+ DAG.getConstant(EltIdx, SL, MVT::i32));
62406258
6241- SDValue Src2SubVec;
6242- if (Src2)
6259+ if (IsPermLane16)
6260+ Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6261+ DAG.getConstant(EltIdx, SL, MVT::i32));
6262+
6263+ if (IID == Intrinsic::amdgcn_writelane)
62436264 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
62446265 DAG.getConstant(EltIdx, SL, MVT::i32));
62456266
6246- Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6267+ Pieces.push_back(
6268+ IsPermLane16
6269+ ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6270+ : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
62476271 EltIdx += 2;
62486272 }
62496273 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
@@ -6257,7 +6281,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62576281 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
62586282 Src0 = DAG.getBitcast(VecVT, Src0);
62596283
6260- if (Src2)
6284+ if (IsPermLane16)
6285+ Src1 = DAG.getBitcast(VecVT, Src1);
6286+
6287+ if (IID == Intrinsic::amdgcn_writelane)
62616288 Src2 = DAG.getBitcast(VecVT, Src2);
62626289
62636290 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8734,6 +8761,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
87348761 case Intrinsic::amdgcn_readlane:
87358762 case Intrinsic::amdgcn_readfirstlane:
87368763 case Intrinsic::amdgcn_writelane:
8764+ case Intrinsic::amdgcn_permlane16:
8765+ case Intrinsic::amdgcn_permlanex16:
8766+ case Intrinsic::amdgcn_permlane64:
87378767 return lowerLaneOp(*this, Op.getNode(), DAG);
87388768 default:
87398769 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
0 commit comments