@@ -5397,25 +5397,42 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
53975397 Register DstReg = MI.getOperand (0 ).getReg ();
53985398 Register Src0 = MI.getOperand (2 ).getReg ();
53995399
5400+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5401+ IID == Intrinsic::amdgcn_permlanex16;
5402+
54005403 auto createLaneOp = [&](Register Src0, Register Src1,
54015404 Register Src2) -> Register {
54025405 auto LaneOp = B.buildIntrinsic (IID, {S32}).addUse (Src0);
54035406 switch (IID) {
54045407 case Intrinsic::amdgcn_readfirstlane:
5408+ case Intrinsic::amdgcn_permlane64:
54055409 return LaneOp.getReg (0 );
54065410 case Intrinsic::amdgcn_readlane:
54075411 return LaneOp.addUse (Src1).getReg (0 );
54085412 case Intrinsic::amdgcn_writelane:
54095413 return LaneOp.addUse (Src1).addUse (Src2).getReg (0 );
5414+ case Intrinsic::amdgcn_permlane16:
5415+ case Intrinsic::amdgcn_permlanex16: {
5416+ Register Src3 = MI.getOperand (5 ).getReg ();
5417+ Register Src4 = MI.getOperand (6 ).getImm ();
5418+ Register Src5 = MI.getOperand (7 ).getImm ();
5419+ return LaneOp.addUse (Src1)
5420+ .addUse (Src2)
5421+ .addUse (Src3)
5422+ .addImm (Src4)
5423+ .addImm (Src5)
5424+ .getReg (0 );
5425+ }
54105426 default :
54115427 llvm_unreachable (" unhandled lane op" );
54125428 }
54135429 };
54145430
54155431 Register Src1, Src2;
5416- if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5432+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5433+ IsPermLane16) {
54175434 Src1 = MI.getOperand (3 ).getReg ();
5418- if (IID == Intrinsic::amdgcn_writelane) {
5435+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ) {
54195436 Src2 = MI.getOperand (4 ).getReg ();
54205437 }
54215438 }
@@ -5433,7 +5450,16 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54335450 ? Src0
54345451 : B.buildBitcast (LLT::scalar (Size), Src0).getReg (0 );
54355452 Src0 = B.buildAnyExt (S32, Src0Cast).getReg (0 );
5436- if (Src2.isValid ()) {
5453+
5454+ if (IsPermLane16) {
5455+ Register Src1Cast =
5456+ MRI.getType (Src1).isScalar ()
5457+ ? Src1
5458+ : B.buildBitcast (LLT::scalar (Size), Src2).getReg (0 );
5459+ Src1 = B.buildAnyExt (LLT::scalar (32 ), Src1Cast).getReg (0 );
5460+ }
5461+
5462+ if (IID == Intrinsic::amdgcn_writelane) {
54375463 Register Src2Cast =
54385464 MRI.getType (Src2).isScalar ()
54395465 ? Src2
@@ -5485,46 +5511,45 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54855511 }
54865512 break ;
54875513 }
5488- case Intrinsic::amdgcn_readfirstlane: {
5514+ case Intrinsic::amdgcn_readfirstlane:
5515+ case Intrinsic::amdgcn_permlane64: {
54895516 for (unsigned i = 0 ; i < NumParts; ++i) {
54905517 Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
54915518 : Src0Parts.getReg (i);
54925519 PartialRes.push_back (
5493- (B.buildIntrinsic (Intrinsic::amdgcn_readfirstlane, {S32})
5494- .addUse (Src0)
5495- .getReg (0 )));
5520+ (B.buildIntrinsic (IID, {S32}).addUse (Src0).getReg (0 )));
54965521 }
54975522
54985523 break ;
54995524 }
5500- case Intrinsic::amdgcn_writelane: {
5525+ case Intrinsic::amdgcn_writelane:
5526+ case Intrinsic::amdgcn_permlane16:
5527+ case Intrinsic::amdgcn_permlanex16: {
55015528 Register Src1 = MI.getOperand (3 ).getReg ();
55025529 Register Src2 = MI.getOperand (4 ).getReg ();
5503- MachineInstrBuilder Src2Parts;
5530+
5531+ Register SrcX = IsPermLane16 ? Src1 : Src2;
5532+ MachineInstrBuilder SrcXParts;
55045533
55055534 if (Ty.isPointer ()) {
5506- auto PtrToInt = B.buildPtrToInt (S64, Src2 );
5507- Src2Parts = B.buildUnmerge (S32, PtrToInt);
5535+ auto PtrToInt = B.buildPtrToInt (S64, SrcX );
5536+ SrcXParts = B.buildUnmerge (S32, PtrToInt);
55085537 } else if (Ty.isPointerVector ()) {
55095538 LLT IntVecTy = Ty.changeElementType (
55105539 LLT::scalar (Ty.getElementType ().getSizeInBits ()));
5511- auto PtrToInt = B.buildPtrToInt (IntVecTy, Src2 );
5512- Src2Parts = B.buildUnmerge (S32, PtrToInt);
5540+ auto PtrToInt = B.buildPtrToInt (IntVecTy, SrcX );
5541+ SrcXParts = B.buildUnmerge (S32, PtrToInt);
55135542 } else
5514- Src2Parts =
5515- IsS16Vec ? B.buildUnmerge (V2S16, Src2 ) : B.buildUnmerge (S32, Src2 );
5543+ SrcXParts =
5544+ IsS16Vec ? B.buildUnmerge (V2S16, SrcX ) : B.buildUnmerge (S32, SrcX );
55165545
55175546 for (unsigned i = 0 ; i < NumParts; ++i) {
55185547 Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
55195548 : Src0Parts.getReg (i);
5520- Src2 = IsS16Vec ? B.buildBitcast (S32, Src2Parts.getReg (i)).getReg (0 )
5521- : Src2Parts.getReg (i);
5522- PartialRes.push_back (
5523- (B.buildIntrinsic (Intrinsic::amdgcn_writelane, {S32})
5524- .addUse (Src0)
5525- .addUse (Src1)
5526- .addUse (Src2))
5527- .getReg (0 ));
5549+ SrcX = IsS16Vec ? B.buildBitcast (S32, SrcXParts.getReg (i)).getReg (0 )
5550+ : SrcXParts.getReg (i);
5551+ PartialRes.push_back (IsPermLane16 ? createLaneOp (Src0, SrcX, Src2)
5552+ : createLaneOp (Src0, Src1, SrcX));
55285553 }
55295554
55305555 break ;
@@ -7519,6 +7544,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
75197544 case Intrinsic::amdgcn_readlane:
75207545 case Intrinsic::amdgcn_writelane:
75217546 case Intrinsic::amdgcn_readfirstlane:
7547+ case Intrinsic::amdgcn_permlane16:
7548+ case Intrinsic::amdgcn_permlanex16:
7549+ case Intrinsic::amdgcn_permlane64:
75227550 return legalizeLaneOp (Helper, MI, IntrID);
75237551 default : {
75247552 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
0 commit comments