@@ -5387,6 +5387,192 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
53875387 return true ;
53885388}
53895389
5390+ bool AMDGPULegalizerInfo::legalizeLaneOp (LegalizerHelper &Helper,
5391+ MachineInstr &MI,
5392+ Intrinsic::ID IID) const {
5393+
5394+ MachineIRBuilder &B = Helper.MIRBuilder ;
5395+ MachineRegisterInfo &MRI = *B.getMRI ();
5396+
5397+ Register DstReg = MI.getOperand (0 ).getReg ();
5398+ Register Src0 = MI.getOperand (2 ).getReg ();
5399+
5400+ auto createLaneOp = [&](Register Src0, Register Src1,
5401+ Register Src2) -> Register {
5402+ auto LaneOp = B.buildIntrinsic (IID, {S32}).addUse (Src0);
5403+ switch (IID) {
5404+ case Intrinsic::amdgcn_readfirstlane:
5405+ return LaneOp.getReg (0 );
5406+ case Intrinsic::amdgcn_readlane:
5407+ return LaneOp.addUse (Src1).getReg (0 );
5408+ case Intrinsic::amdgcn_writelane:
5409+ return LaneOp.addUse (Src1).addUse (Src2).getReg (0 );
5410+ default :
5411+ llvm_unreachable (" unhandled lane op" );
5412+ }
5413+ };
5414+
5415+ Register Src1, Src2;
5416+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5417+ Src1 = MI.getOperand (3 ).getReg ();
5418+ if (IID == Intrinsic::amdgcn_writelane) {
5419+ Src2 = MI.getOperand (4 ).getReg ();
5420+ }
5421+ }
5422+
5423+ LLT Ty = MRI.getType (DstReg);
5424+ unsigned Size = Ty.getSizeInBits ();
5425+
5426+ if (Size == 32 ) {
5427+ // Already legal
5428+ return true ;
5429+ }
5430+
5431+ if (Size < 32 ) {
5432+ Register Src0Cast = MRI.getType (Src0).isScalar ()
5433+ ? Src0
5434+ : B.buildBitcast (LLT::scalar (Size), Src0).getReg (0 );
5435+ Src0 = B.buildAnyExt (S32, Src0Cast).getReg (0 );
5436+ if (Src2.isValid ()) {
5437+ Register Src2Cast =
5438+ MRI.getType (Src2).isScalar ()
5439+ ? Src2
5440+ : B.buildBitcast (LLT::scalar (Size), Src2).getReg (0 );
5441+ Src2 = B.buildAnyExt (LLT::scalar (32 ), Src2Cast).getReg (0 );
5442+ }
5443+
5444+ Register LaneOpDst = createLaneOp (Src0, Src1, Src2);
5445+ if (Ty.isScalar ())
5446+ B.buildTrunc (DstReg, LaneOpDst);
5447+ else {
5448+ auto Trunc = B.buildTrunc (LLT::scalar (Size), LaneOpDst);
5449+ B.buildBitcast (DstReg, Trunc);
5450+ }
5451+
5452+ MI.eraseFromParent ();
5453+ return true ;
5454+ }
5455+
5456+ if ((Size % 32 ) == 0 ) {
5457+ SmallVector<Register, 2 > PartialRes;
5458+ unsigned NumParts = Size / 32 ;
5459+ auto IsS16Vec = Ty.isVector () && Ty.getElementType () == S16;
5460+ MachineInstrBuilder Src0Parts;
5461+
5462+ if (Ty.isPointer ()) {
5463+ auto PtrToInt = B.buildPtrToInt (LLT::scalar (Size), Src0);
5464+ Src0Parts = B.buildUnmerge (S32, PtrToInt);
5465+ } else if (Ty.isPointerVector ()) {
5466+ LLT IntVecTy = Ty.changeElementType (
5467+ LLT::scalar (Ty.getElementType ().getSizeInBits ()));
5468+ auto PtrToInt = B.buildPtrToInt (IntVecTy, Src0);
5469+ Src0Parts = B.buildUnmerge (S32, PtrToInt);
5470+ } else
5471+ Src0Parts =
5472+ IsS16Vec ? B.buildUnmerge (V2S16, Src0) : B.buildUnmerge (S32, Src0);
5473+
5474+ switch (IID) {
5475+ case Intrinsic::amdgcn_readlane: {
5476+ Register Src1 = MI.getOperand (3 ).getReg ();
5477+ for (unsigned i = 0 ; i < NumParts; ++i) {
5478+ Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
5479+ : Src0Parts.getReg (i);
5480+ PartialRes.push_back (
5481+ (B.buildIntrinsic (Intrinsic::amdgcn_readlane, {S32})
5482+ .addUse (Src0)
5483+ .addUse (Src1))
5484+ .getReg (0 ));
5485+ }
5486+ break ;
5487+ }
5488+ case Intrinsic::amdgcn_readfirstlane: {
5489+ for (unsigned i = 0 ; i < NumParts; ++i) {
5490+ Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
5491+ : Src0Parts.getReg (i);
5492+ PartialRes.push_back (
5493+ (B.buildIntrinsic (Intrinsic::amdgcn_readfirstlane, {S32})
5494+ .addUse (Src0)
5495+ .getReg (0 )));
5496+ }
5497+
5498+ break ;
5499+ }
5500+ case Intrinsic::amdgcn_writelane: {
5501+ Register Src1 = MI.getOperand (3 ).getReg ();
5502+ Register Src2 = MI.getOperand (4 ).getReg ();
5503+ MachineInstrBuilder Src2Parts;
5504+
5505+ if (Ty.isPointer ()) {
5506+ auto PtrToInt = B.buildPtrToInt (S64, Src2);
5507+ Src2Parts = B.buildUnmerge (S32, PtrToInt);
5508+ } else if (Ty.isPointerVector ()) {
5509+ LLT IntVecTy = Ty.changeElementType (
5510+ LLT::scalar (Ty.getElementType ().getSizeInBits ()));
5511+ auto PtrToInt = B.buildPtrToInt (IntVecTy, Src2);
5512+ Src2Parts = B.buildUnmerge (S32, PtrToInt);
5513+ } else
5514+ Src2Parts =
5515+ IsS16Vec ? B.buildUnmerge (V2S16, Src2) : B.buildUnmerge (S32, Src2);
5516+
5517+ for (unsigned i = 0 ; i < NumParts; ++i) {
5518+ Src0 = IsS16Vec ? B.buildBitcast (S32, Src0Parts.getReg (i)).getReg (0 )
5519+ : Src0Parts.getReg (i);
5520+ Src2 = IsS16Vec ? B.buildBitcast (S32, Src2Parts.getReg (i)).getReg (0 )
5521+ : Src2Parts.getReg (i);
5522+ PartialRes.push_back (
5523+ (B.buildIntrinsic (Intrinsic::amdgcn_writelane, {S32})
5524+ .addUse (Src0)
5525+ .addUse (Src1)
5526+ .addUse (Src2))
5527+ .getReg (0 ));
5528+ }
5529+
5530+ break ;
5531+ }
5532+ }
5533+
5534+ if (Ty.isPointerVector ()) {
5535+ unsigned PtrSize = Ty.getElementType ().getSizeInBits ();
5536+ SmallVector<Register, 2 > PtrElements;
5537+ if (PtrSize == 32 ) {
5538+ // Handle 32 bit pointers
5539+ for (unsigned i = 0 ; i < NumParts; i++)
5540+ PtrElements.push_back (
5541+ B.buildIntToPtr (Ty.getElementType (), PartialRes[i]).getReg (0 ));
5542+ } else {
5543+ // Handle legalization of <? x [pointer type bigger than 32 bits]>
5544+ SmallVector<Register, 2 > PtrParts;
5545+ unsigned NumS32Parts = PtrSize / 32 ;
5546+ unsigned PartIdx = 0 ;
5547+ for (unsigned i = 0 , j = 1 ; i < NumParts; i += NumS32Parts, j++) {
5548+ // Merge S32 components of a pointer element first.
5549+ for (; PartIdx < (j * NumS32Parts); PartIdx++)
5550+ PtrParts.push_back (PartialRes[PartIdx]);
5551+
5552+ auto MergedPtr =
5553+ B.buildMergeLikeInstr (LLT::scalar (PtrSize), PtrParts);
5554+ PtrElements.push_back (
5555+ B.buildIntToPtr (Ty.getElementType (), MergedPtr).getReg (0 ));
5556+ PtrParts.clear ();
5557+ }
5558+ }
5559+
5560+ B.buildMergeLikeInstr (DstReg, PtrElements);
5561+ } else {
5562+ if (IsS16Vec) {
5563+ for (unsigned i = 0 ; i < NumParts; i++)
5564+ PartialRes[i] = B.buildBitcast (V2S16, PartialRes[i]).getReg (0 );
5565+ }
5566+ B.buildMergeLikeInstr (DstReg, PartialRes);
5567+ }
5568+
5569+ MI.eraseFromParent ();
5570+ return true ;
5571+ }
5572+
5573+ return false ;
5574+ }
5575+
53905576bool AMDGPULegalizerInfo::getImplicitArgPtr (Register DstReg,
53915577 MachineRegisterInfo &MRI,
53925578 MachineIRBuilder &B) const {
@@ -7330,6 +7516,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
73307516 Observer.changedInstr (MI);
73317517 return true ;
73327518 }
7519+ case Intrinsic::amdgcn_readlane:
7520+ case Intrinsic::amdgcn_writelane:
7521+ case Intrinsic::amdgcn_readfirstlane:
7522+ return legalizeLaneOp (Helper, MI, IntrID);
73337523 default : {
73347524 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
73357525 AMDGPU::getImageDimIntrinsicInfo (IntrID))
0 commit comments