@@ -5397,25 +5397,39 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
53975397 Register DstReg = MI.getOperand (0 ).getReg ();
53985398 Register Src0 = MI.getOperand (2 ).getReg ();
53995399
5400+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5401+ IID == Intrinsic::amdgcn_permlanex16;
5402+
54005403 auto createLaneOp = [&](Register Src0, Register Src1,
54015404 Register Src2) -> Register {
54025405 auto LaneOp = B.buildIntrinsic (IID, {S32}).addUse (Src0);
54035406 switch (IID) {
54045407 case Intrinsic::amdgcn_readfirstlane:
5408+ case Intrinsic::amdgcn_permlane64:
54055409 return LaneOp.getReg (0 );
54065410 case Intrinsic::amdgcn_readlane:
54075411 return LaneOp.addUse (Src1).getReg (0 );
54085412 case Intrinsic::amdgcn_writelane:
54095413 return LaneOp.addUse (Src1).addUse (Src2).getReg (0 );
5414+ case Intrinsic::amdgcn_permlane16:
5415+ case Intrinsic::amdgcn_permlanex16: {
5416+ Register Src3 = MI.getOperand (5 ).getReg ();
5417+ Register Src4 = MI.getOperand (6 ).getImm ();
5418+ Register Src5 = MI.getOperand (7 ).getImm ();
5419+ return LaneOp.addUse (Src1).addUse (Src2).
5420+ addUse (Src3).
5421+ addImm (Src4).
5422+ addImm (Src5).getReg (0 );
5423+ }
54105424 default :
54115425 llvm_unreachable (" unhandled lane op" );
54125426 }
54135427 };
54145428
54155429 Register Src1, Src2;
5416- if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5430+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || IsPermLane16 ) {
54175431 Src1 = MI.getOperand (3 ).getReg ();
5418- if (IID == Intrinsic::amdgcn_writelane) {
5432+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ) {
54195433 Src2 = MI.getOperand (4 ).getReg ();
54205434 }
54215435 }
@@ -5433,7 +5447,16 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54335447 ? Src0
54345448 : B.buildBitcast (LLT::scalar (Size), Src0).getReg (0 );
54355449 Src0 = B.buildAnyExt (S32, Src0Cast).getReg (0 );
5436- if (Src2.isValid ()) {
5450+
5451+ if (IsPermLane16) {
5452+ Register Src1Cast =
5453+ MRI.getType (Src1).isScalar ()
5454+ ? Src1
5455+ : B.buildBitcast (LLT::scalar (Size), Src2).getReg (0 );
5456+ Src1 = B.buildAnyExt (LLT::scalar (32 ), Src1Cast).getReg (0 );
5457+ }
5458+
5459+ if (IID == Intrinsic::amdgcn_writelane) {
54375460 Register Src2Cast =
54385461 MRI.getType (Src2).isScalar ()
54395462 ? Src2
@@ -5485,46 +5508,48 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54855508 }
54865509 break ;
54875510 }
5488- case Intrinsic::amdgcn_readfirstlane: {
5511+ case Intrinsic::amdgcn_readfirstlane:
5512+ case Intrinsic::amdgcn_permlane64: {
54895513 for (unsigned i = 0 ; i < NumParts; ++i) {
54905514 Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
54915515 : Src0Parts.getReg (i);
54925516 PartialRes.push_back (
5493- (B.buildIntrinsic (Intrinsic::amdgcn_readfirstlane , {S32})
5517+ (B.buildIntrinsic (IID , {S32})
54945518 .addUse (Src0)
54955519 .getReg (0 )));
54965520 }
54975521
54985522 break ;
54995523 }
5500- case Intrinsic::amdgcn_writelane: {
5524+ case Intrinsic::amdgcn_writelane:
5525+ case Intrinsic::amdgcn_permlane16:
5526+ case Intrinsic::amdgcn_permlanex16: {
55015527 Register Src1 = MI.getOperand (3 ).getReg ();
55025528 Register Src2 = MI.getOperand (4 ).getReg ();
5503- MachineInstrBuilder Src2Parts;
5529+
5530+ Register SrcX = IsPermLane16 ? Src1 : Src2;
5531+ MachineInstrBuilder SrcXParts;
55045532
55055533 if (Ty.isPointer ()) {
5506- auto PtrToInt = B.buildPtrToInt (S64, Src2 );
5507- Src2Parts = B.buildUnmerge (S32, PtrToInt);
5534+ auto PtrToInt = B.buildPtrToInt (S64, SrcX );
5535+ SrcXParts = B.buildUnmerge (S32, PtrToInt);
55085536 } else if (Ty.isPointerVector ()) {
55095537 LLT IntVecTy = Ty.changeElementType (
55105538 LLT::scalar (Ty.getElementType ().getSizeInBits ()));
5511- auto PtrToInt = B.buildPtrToInt (IntVecTy, Src2 );
5512- Src2Parts = B.buildUnmerge (S32, PtrToInt);
5539+ auto PtrToInt = B.buildPtrToInt (IntVecTy, SrcX );
5540+ SrcXParts = B.buildUnmerge (S32, PtrToInt);
55135541 } else
5514- Src2Parts =
5515- IsS16Vec ? B.buildUnmerge (V2S16, Src2 ) : B.buildUnmerge (S32, Src2 );
5542+ SrcXParts =
5543+ IsS16Vec ? B.buildUnmerge (V2S16, SrcX ) : B.buildUnmerge (S32, SrcX );
55165544
55175545 for (unsigned i = 0 ; i < NumParts; ++i) {
55185546 Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
55195547 : Src0Parts.getReg (i);
5520- Src2 = IsS16Vec ? B.buildBitcast (S32, Src2Parts.getReg (i)).getReg (0 )
5521- : Src2Parts.getReg (i);
5522- PartialRes.push_back (
5523- (B.buildIntrinsic (Intrinsic::amdgcn_writelane, {S32})
5524- .addUse (Src0)
5525- .addUse (Src1)
5526- .addUse (Src2))
5527- .getReg (0 ));
5548+ SrcX = IsS16Vec ? B.buildBitcast (S32, SrcXParts.getReg (i)).getReg (0 )
5549+ : SrcXParts.getReg (i);
5550+ PartialRes.push_back ( IsPermLane16 ?
5551+ createLaneOp (Src0, SrcX, Src2) :
5552+ createLaneOp (Src0, Src1, SrcX));
55285553 }
55295554
55305555 break ;
@@ -7519,6 +7544,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
75197544 case Intrinsic::amdgcn_readlane:
75207545 case Intrinsic::amdgcn_writelane:
75217546 case Intrinsic::amdgcn_readfirstlane:
7547+ case Intrinsic::amdgcn_permlane16:
7548+ case Intrinsic::amdgcn_permlanex16:
7549+ case Intrinsic::amdgcn_permlane64:
75227550 return legalizeLaneOp (Helper, MI, IntrID);
75237551 default : {
75247552 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
0 commit comments