diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 9be8821d5bf96..d371d97b77d3a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -270,6 +270,66 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, }); } } + + // Only perform D16 folding if every user of the image sample is + // an ExtractElementInst immediately followed by an FPTrunc to half. + SmallVector, 4> + ExtractTruncPairs; + bool AllHalfExtracts = true; + + for (User *U : II.users()) { + auto *Ext = dyn_cast(U); + if (!Ext || !Ext->hasOneUse()) { + AllHalfExtracts = false; + break; + } + + auto *Tr = dyn_cast(*Ext->user_begin()); + if (!Tr || !Tr->getType()->isHalfTy()) { + AllHalfExtracts = false; + break; + } + + ExtractTruncPairs.emplace_back(Ext, Tr); + } + + if (!ExtractTruncPairs.empty() && AllHalfExtracts) { + auto *VecTy = cast(II.getType()); + Type *HalfVecTy = + VecTy->getWithNewType(Type::getHalfTy(II.getContext())); + + // Obtain the original image sample intrinsic's signature + // and replace its return type with the half-vector for D16 folding + SmallVector SigTys; + Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys); + SigTys[0] = HalfVecTy; + + Module *M = II.getModule(); + Function *HalfDecl = + Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys); + + II.mutateType(HalfVecTy); + II.setCalledFunction(HalfDecl); + + IRBuilder<> Builder(II.getContext()); + for (auto &[Ext, Tr] : ExtractTruncPairs) { + Value *Idx = Ext->getIndexOperand(); + + Builder.SetInsertPoint(Tr); + + Value *HalfExtract = Builder.CreateExtractElement(&II, Idx); + HalfExtract->takeName(Tr); + + Tr->replaceAllUsesWith(HalfExtract); + } + + for (auto &[Ext, Tr] : ExtractTruncPairs) { + IC.eraseInstFromFunction(*Tr); + IC.eraseInstFromFunction(*Ext); + } + + return &II; + } } } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll index 30431ad724843..ee5ccf5af987d 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll @@ -3,6 +3,7 @@ ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s +; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16( @@ -121,6 +122,123 @@ main_body: ret half %addf_sum.2 } +define amdgpu_ps half @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) { +; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16( +; GFX7-NEXT: main_body: +; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0 +; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half +; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1 +; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half +; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2 +; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to half +; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]] +; GFX7-NEXT: ret half [[RES]] +; +; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16( +; GFX81PLUS-NEXT: main_body: +; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0 +; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1 +; GFX81PLUS-NEXT: [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2 +; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]] +; GFX81PLUS-NEXT: ret half [[RES]] +; +main_body: + %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %e0 = extractelement <4 x float> %sample, i32 0 + %h0 = fptrunc float %e0 to half + %e1 = extractelement <4 x float> %sample, i32 1 + %h1 = fptrunc float %e1 to half + %e2 = extractelement <4 x float> %sample, i32 2 + %h2 = fptrunc float %e2 to half + %mul = fmul half %h0, %h1 + %res = fadd half %mul, %h2 + ret half %res +} + +define amdgpu_ps half @image_sample_2d_extractelement_multi_use_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) { +; GFX7-LABEL: @image_sample_2d_extractelement_multi_use_no_d16( +; GFX7-NEXT: main_body: +; GFX7-NEXT: [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX7-NEXT: [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0 +; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half +; GFX7-NEXT: [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00 +; GFX7-NEXT: [[HALF:%.*]] = fptrunc float [[USER2]] to half +; GFX7-NEXT: [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1 +; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half +; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF]] +; GFX7-NEXT: ret half [[RES]] +; +; GFX81PLUS-LABEL: @image_sample_2d_extractelement_multi_use_no_d16( +; GFX81PLUS-NEXT: main_body: +; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX81PLUS-NEXT: [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0 +; GFX81PLUS-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half +; GFX81PLUS-NEXT: [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00 +; GFX81PLUS-NEXT: [[HALF:%.*]] = fptrunc float [[USER2]] to half +; GFX81PLUS-NEXT: [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1 +; GFX81PLUS-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half +; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF]] +; GFX81PLUS-NEXT: ret half [[RES]] +; +main_body: + %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0) + %e0 = extractelement <4 x float> %sample, i32 0 + %h0 = fptrunc float %e0 to half + %user2 = fadd float %e0, 1.0 + %half = fptrunc float %user2 to half + %e1 = extractelement <4 x float> %sample, i32 1 + %h1 = fptrunc float %e1 to half + %mul = fmul half %h0, %h1 + %res = fadd half %mul, %half + ret half %res +} + +define amdgpu_ps bfloat @image_sample_2d_multi_fptrunc_non_half_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) { +; GFX7-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16( +; GFX7-NEXT: main_body: +; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0 +; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to bfloat +; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1 +; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to bfloat +; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2 +; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to bfloat +; GFX7-NEXT: [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]] +; GFX7-NEXT: [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]] +; GFX7-NEXT: ret bfloat [[RES]] +; +; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16( +; GFX81PLUS-NEXT: main_body: +; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX81PLUS-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0 +; GFX81PLUS-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to bfloat +; GFX81PLUS-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1 +; GFX81PLUS-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to bfloat +; GFX81PLUS-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2 +; GFX81PLUS-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to bfloat +; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]] +; GFX81PLUS-NEXT: [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]] +; GFX81PLUS-NEXT: ret bfloat [[RES]] +; +main_body: + %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0) + %e0 = extractelement <4 x float> %sample, i32 0 + %h0 = fptrunc float %e0 to bfloat + %e1 = extractelement <4 x float> %sample, i32 1 + %h1 = fptrunc float %e1 to bfloat + %e2 = extractelement <4 x float> %sample, i32 2 + %h2 = fptrunc float %e2 to bfloat + %mul = fmul bfloat %h0, %h1 + %res = fadd bfloat %mul, %h2 + ret bfloat %res +} + define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX7-LABEL: @image_gather4_2d_v4f32( ; GFX7-NEXT: main_body: