From 53d8ce509bd363bebeb76d85306b434ff84f378d Mon Sep 17 00:00:00 2001 From: "Pirog, Mikolaj Maciej" Date: Thu, 11 Sep 2025 11:58:34 +0200 Subject: [PATCH 1/6] Don't rely on the global contract flag in tests --- llvm/test/CodeGen/X86/avx512-fma.ll | 62 +-- .../X86/avx512fp16-combine-vfmac-fadd.ll | 52 +-- .../X86/avx512fp16-combine-vfmulc-fadd.ll | 2 +- .../X86/avx512fp16-combine-xor-vfmulc-fadd.ll | 2 +- .../X86/avx512fp16-combine-xor-vfmulc.ll | 10 +- .../CodeGen/X86/dag-combiner-fma-folding.ll | 2 +- llvm/test/CodeGen/X86/fma-do-not-commute.ll | 6 +- llvm/test/CodeGen/X86/fma_patterns.ll | 390 +++++++++--------- llvm/test/CodeGen/X86/fma_patterns_wide.ll | 164 ++++---- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 14 +- llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 50 +-- 11 files changed, 373 insertions(+), 381 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-fma.ll b/llvm/test/CodeGen/X86/avx512-fma.ll index 97f8e5f4ea16c..54343ee771ff7 100644 --- a/llvm/test/CodeGen/X86/avx512-fma.ll +++ b/llvm/test/CodeGen/X86/avx512-fma.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; ALL-LABEL: test_x86_fmadd_ps_z: ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fadd <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fadd contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -17,8 +17,8 @@ define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 ; ALL: ## %bb.0: ; ALL-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -27,8 +27,8 @@ define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <1 ; ALL: ## %bb.0: ; ALL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %a2, %x + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %a2, %x ret <16 x float> %res } @@ -37,12 +37,12 @@ define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <1 ; ALL: ## %bb.0: ; ALL-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %y = fsub <16 x float> , %x - %res = fsub <16 x float> %y, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %y = fsub contract <16 x float> , %x + %res = fsub contract <16 x float> %y, %a2 ret <16 x float> %res } @@ -51,8 +51,8 @@ define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; ALL-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fadd <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fadd contract <8 x double> %x, %a2 ret <8 x
double> %res } @@ -61,8 +61,8 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 ; ALL: ## %bb.0: ; ALL-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; ALL-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fsub <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fsub contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -71,8 +71,8 @@ define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) { ; ALL: ## %bb.0: ; ALL-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; ALL-NEXT: retq - %x = fmul double %a0, %a1 - %res = fsub double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fsub contract double %x, %a2 ret double %res } @@ -82,8 +82,8 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, ptr %a2_ptr) { ; ALL-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem ; ALL-NEXT: retq %a2 = load double , ptr%a2_ptr - %x = fmul double %a0, %a1 - %res = fsub double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fsub contract double %x, %a2 ret double %res } @@ -93,8 +93,8 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, ptr %a2_ptr) { ; ALL-NEXT: vfmsub132sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 ; ALL-NEXT: retq %a2 = load double , ptr%a2_ptr - %x = fmul double %a0, %a2 - %res = fsub double %x, %a1 + %x = fmul contract double %a0, %a2 + %res = fsub contract double %x, %a1 ret double %res } @@ -103,8 +103,8 @@ define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1 ; ALL-NEXT: retq - %b1 = fmul <16 x float> %a1, - %b2 = fadd <16 x float> %b1, %a2 + %b1 = fmul contract <16 x float> %a1, + %b2 = fadd contract <16 x float> %b1, %a2 ret <16 x float> %b2 } @@ -113,8 +113,8 @@ define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem ; ALL-NEXT: retq - %b1 = fmul <16 x float> %a1, %a2 - %b2 = fadd <16 x float> %b1, + %b1 = fmul contract <16 x float> %a1, %a2 + %b2 = fadd contract <16 x float> %b1, ret <16 x float> %b2 } @@ -135,8 +135,8 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, pt ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * mem) + zmm1 ; SKX-NEXT: retq %a2 = load <16 x float>,ptr%a2_ptrt,align 1 - %x = fmul <16 x float> %a0, %a2 - %y = fadd <16 x float> %x, %a1 + %x = fmul contract <16 x float> %a0, %a2 + %y = fadd contract <16 x float> %x, %a1 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0 ret <16 x float> %res } @@ -160,8 +160,8 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, pt ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %a2 = load <16 x float>,ptr%a2_ptrt,align 1 - %x = fmul <16 x float> %a0, %a2 - %y = fadd <16 x float> %x, %a1 + %x = fmul contract <16 x float> %a0, %a2 + %y = fadd contract <16 x float> %x, %a1 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1 ret <16 x float> %res } @@ -185,8 +185,8 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, pt ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %a2 = load <16 x float>,ptr%a2_ptrt,align 1 - %x = fmul <16 x float> %a1, %a0 - %y = fadd <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a1, %a0 + %y = fadd contract <16 x float> %x, %a2 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1 ret <16 x float> %res } diff --git 
a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll index 36b95e744ba14..52e9507d43a1f 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ ; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set. define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) { @@ -18,9 +18,9 @@ define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -39,9 +39,9 @@ define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -60,9 +60,9 @@ define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -81,9 +81,9 @@ define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) %3 = 
bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -102,9 +102,9 @@ define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } @@ -123,9 +123,9 @@ define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } @@ -138,9 +138,9 @@ define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x hal entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -152,9 +152,9 @@ define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x hal entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -166,9 +166,9 @@ define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x hal entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -180,9 +180,9 @@ define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x hal entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x 
float> , i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -194,9 +194,9 @@ define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> % entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } @@ -208,9 +208,9 @@ define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> % entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll index a509503584649..20df18c5a18b9 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll index 43f30da15b20d..d96d7d1602040 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll index 7b142ea170c22..caf428a8c94d3 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast 
--enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: @@ -94,13 +94,13 @@ define dso_local <32 x half> @test6(<16 x i32> %a, <16 x float> %b) local_unname entry: %0 = xor <16 x i32> %a, splat (i32 -2147483648) %1 = bitcast <16 x i32> %0 to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4) + %4 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4) %5 = bitcast <16 x float> %4 to <32 x half> - %6 = fadd <32 x half> %3, %5 + %6 = fadd contract <32 x half> %3, %5 %7 = bitcast <16 x float> %b to <32 x half> - %8 = fadd <32 x half> %6, %7 + %8 = fadd contract <32 x half> %6, %7 ret <32 x half> %8 } diff --git a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll index 6291100f42c3d..c13b534b6881a 100644 --- a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll +++ b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - | FileCheck %s -; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - -fp-contract=fast | FileCheck %s +; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - | FileCheck %s define double @fma_folding(double %x) { ; CHECK-LABEL: fma_folding: diff --git a/llvm/test/CodeGen/X86/fma-do-not-commute.ll b/llvm/test/CodeGen/X86/fma-do-not-commute.ll index 0dc8e62c56d0c..3009db2859dba 100644 --- a/llvm/test/CodeGen/X86/fma-do-not-commute.ll +++ b/llvm/test/CodeGen/X86/fma-do-not-commute.ll @@ -1,4 +1,4 @@ -; RUN: llc -fp-contract=fast -mattr=+fma -disable-cgp < %s -o - | FileCheck %s +; RUN: llc -mattr=+fma -disable-cgp < %s -o - | FileCheck %s ; Check that the 2nd and 3rd arguments of fmaXXX231 reg1, reg2, mem3 are not commuted. 
; target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @@ -20,8 +20,8 @@ loop: %sum0 = phi float [ %fma, %loop ], [ %arg, %entry ] %addrVal = load float, ptr %addr, align 4 %addr2Val = load float, ptr %addr2, align 4 - %fmul = fmul float %addrVal, %addr2Val - %fma = fadd float %sum0, %fmul + %fmul = fmul contract float %addrVal, %addr2Val + %fma = fadd contract float %sum0, %fmul br i1 true, label %exit, label %loop exit: diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index dc35c8f8dc657..8b409ba53b0fb 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefixes=FMA,FMA-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefixes=FMA4,FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefixes=FMA4,FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefixes=AVX512,AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=FMA,FMA-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefixes=FMA4,FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefixes=FMA4,FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS ; -; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) +; Pattern: (fadd contract (fmul contract x, y), z) -> (fmadd x,y,z) ; define float @test_f32_fmadd(float %a0, float %a1, float %a2) { @@ -27,8 +27,8 @@ define float @test_f32_fmadd(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %res = fadd float %x, %a2 + %x = fmul contract float %a0, %a1 + %res = fadd contract float %x, %a2 ret float %res } @@ -47,8 +47,8 @@ define <4 x float> 
@test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %res = fadd <4 x float> %x, %a2 + %x = fmul contract <4 x float> %a0, %a1 + %res = fadd contract <4 x float> %x, %a2 ret <4 x float> %res } @@ -67,8 +67,8 @@ define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %res = fadd <8 x float> %x, %a2 + %x = fmul contract <8 x float> %a0, %a1 + %res = fadd contract <8 x float> %x, %a2 ret <8 x float> %res } @@ -87,8 +87,8 @@ define double @test_f64_fmadd(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %res = fadd double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fadd contract double %x, %a2 ret double %res } @@ -107,8 +107,8 @@ define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %res = fadd <2 x double> %x, %a2 + %x = fmul contract <2 x double> %a0, %a1 + %res = fadd contract <2 x double> %x, %a2 ret <2 x double> %res } @@ -127,13 +127,13 @@ define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %res = fadd <4 x double> %x, %a2 + %x = fmul contract <4 x double> %a0, %a1 + %res = fadd contract <4 x double> %x, %a2 ret <4 x double> %res } ; -; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z) +; Pattern: (fsub contract (fmul contract x, y), z) -> (fmsub x, y, z) ; define float @test_f32_fmsub(float %a0, float %a1, float %a2) { @@ -151,8 +151,8 @@ define float @test_f32_fmsub(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %res = fsub float %x, %a2 + %x = fmul contract float %a0, %a1 + %res = fsub contract float %x, %a2 ret float %res } @@ -171,8 +171,8 @@ define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %res = fsub <4 x float> %x, %a2 + %x = fmul contract <4 x float> %a0, %a1 + %res = fsub contract <4 x float> %x, %a2 ret <4 x float> %res } @@ -191,8 +191,8 @@ define <8 x float> @test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %res = fsub <8 x float> %x, %a2 + %x = fmul contract <8 x float> %a0, %a1 + %res = fsub contract <8 x float> %x, %a2 ret <8 x float> %res } @@ -211,8 +211,8 @@ define double @test_f64_fmsub(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %res = fsub double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fsub contract double %x, %a2 ret double %res } @@ -231,8 +231,8 @@ define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x do ; 
AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %res = fsub <2 x double> %x, %a2 + %x = fmul contract <2 x double> %a0, %a1 + %res = fsub contract <2 x double> %x, %a2 ret <2 x double> %res } @@ -251,13 +251,13 @@ define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %res = fsub <4 x double> %x, %a2 + %x = fmul contract <4 x double> %a0, %a1 + %res = fsub contract <4 x double> %x, %a2 ret <4 x double> %res } ; -; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z) +; Pattern: (fsub contract z, (fmul contract x, y)) -> (fnmadd x, y, z) ; define float @test_f32_fnmadd(float %a0, float %a1, float %a2) { @@ -275,8 +275,8 @@ define float @test_f32_fnmadd(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %res = fsub float %a2, %x + %x = fmul contract float %a0, %a1 + %res = fsub contract float %a2, %x ret float %res } @@ -295,8 +295,8 @@ define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %res = fsub <4 x float> %a2, %x + %x = fmul contract <4 x float> %a0, %a1 + %res = fsub contract <4 x float> %a2, %x ret <4 x float> %res } @@ -315,8 +315,8 @@ define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %res = fsub <8 x float> %a2, %x + %x = fmul contract <8 x float> %a0, %a1 + %res = fsub contract <8 x float> %a2, %x ret <8 x float> %res } @@ -335,8 +335,8 @@ define double @test_f64_fnmadd(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %res = fsub double %a2, %x + %x = fmul contract double %a0, %a1 + %res = fsub contract double %a2, %x ret double %res } @@ -355,8 +355,8 @@ define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %res = fsub <2 x double> %a2, %x + %x = fmul contract <2 x double> %a0, %a1 + %res = fsub contract <2 x double> %a2, %x ret <2 x double> %res } @@ -375,13 +375,13 @@ define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %res = fsub <4 x double> %a2, %x + %x = fmul contract <4 x double> %a0, %a1 + %res = fsub contract <4 x double> %a2, %x ret <4 x double> %res } ; -; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z) +; Pattern: (fsub contract (fneg (fmul contract x, y)), z) -> (fnmsub x, y, z) ; define float @test_f32_fnmsub(float %a0, float %a1, float %a2) { @@ -399,9 +399,9 @@ define float @test_f32_fnmsub(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %y = fsub float -0.000000e+00, %x - %res = fsub 
float %y, %a2 + %x = fmul contract float %a0, %a1 + %y = fsub contract float -0.000000e+00, %x + %res = fsub contract float %y, %a2 ret float %res } @@ -420,9 +420,9 @@ define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %y = fsub <4 x float> , %x - %res = fsub <4 x float> %y, %a2 + %x = fmul contract <4 x float> %a0, %a1 + %y = fsub contract <4 x float> , %x + %res = fsub contract <4 x float> %y, %a2 ret <4 x float> %res } @@ -441,9 +441,9 @@ define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %y = fsub <8 x float> , %x - %res = fsub <8 x float> %y, %a2 + %x = fmul contract <8 x float> %a0, %a1 + %y = fsub contract <8 x float> , %x + %res = fsub contract <8 x float> %y, %a2 ret <8 x float> %res } @@ -462,9 +462,9 @@ define double @test_f64_fnmsub(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %y = fsub double -0.000000e+00, %x - %res = fsub double %y, %a2 + %x = fmul contract double %a0, %a1 + %y = fsub contract double -0.000000e+00, %x + %res = fsub contract double %y, %a2 ret double %res } @@ -483,9 +483,9 @@ define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %y = fsub <2 x double> , %x - %res = fsub <2 x double> %y, %a2 + %x = fmul contract <2 x double> %a0, %a1 + %y = fsub contract <2 x double> , %x + %res = fsub contract <2 x double> %y, %a2 ret <2 x double> %res } @@ -504,9 +504,9 @@ define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %y = fsub <4 x double> , %x - %res = fsub <4 x double> %y, %a2 + %x = fmul contract <4 x double> %a0, %a1 + %y = fsub contract <4 x double> , %x + %res = fsub contract <4 x double> %y, %a2 ret <4 x double> %res } @@ -530,8 +530,8 @@ define <4 x float> @test_4f32_fmadd_load(ptr %a0, <4 x float> %a1, <4 x float> % ; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 ; AVX512-NEXT: retq %x = load <4 x float>, ptr %a0 - %y = fmul <4 x float> %x, %a1 - %res = fadd <4 x float> %y, %a2 + %y = fmul contract <4 x float> %x, %a1 + %res = fadd contract <4 x float> %y, %a2 ret <4 x float> %res } @@ -551,8 +551,8 @@ define <2 x double> @test_2f64_fmsub_load(ptr %a0, <2 x double> %a1, <2 x double ; AVX512-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 ; AVX512-NEXT: retq %x = load <2 x double>, ptr %a0 - %y = fmul <2 x double> %x, %a1 - %res = fsub <2 x double> %y, %a2 + %y = fmul contract <2 x double> %x, %a1 + %res = fsub contract <2 x double> %y, %a2 ret <2 x double> %res } @@ -593,8 +593,8 @@ define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %a, %y + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %a, %y ret <4 x float> %m } @@ 
-631,8 +631,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -669,8 +669,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -707,8 +707,8 @@ define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %a, %y + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %a, %y ret <4 x float> %m } @@ -745,8 +745,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -783,8 +783,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x fl ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -824,8 +824,8 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -865,8 +865,8 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -906,8 +906,8 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -947,8 +947,8 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -988,8 +988,8 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y ; 
AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1029,8 +1029,8 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x fl ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1067,8 +1067,8 @@ define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -1105,8 +1105,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1143,8 +1143,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1181,8 +1181,8 @@ define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -1219,8 +1219,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1257,8 +1257,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x fl ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1308,10 +1308,10 @@ define float @test_f32_interp(float %x, float %y, float %t) { ; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz float 1.0, %t - %tx = fmul nsz float %x, %t - %ty = fmul nsz float %y, %t1 - %r = fadd nsz float %tx, %ty + %t1 = fsub contract nsz float 1.0, %t + %tx = fmul contract nsz float %x, %t + %ty = fmul contract nsz float %y, %t1 + %r = fadd contract nsz float %tx, %ty ret float %r } @@ -1357,10 +1357,10 @@ 
define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <4 x float> , %t - %tx = fmul nsz <4 x float> %x, %t - %ty = fmul nsz <4 x float> %y, %t1 - %r = fadd nsz <4 x float> %tx, %ty + %t1 = fsub contract nsz <4 x float> , %t + %tx = fmul contract nsz <4 x float> %x, %t + %ty = fmul contract nsz <4 x float> %y, %t1 + %r = fadd contract nsz <4 x float> %tx, %ty ret <4 x float> %r } @@ -1406,10 +1406,10 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <8 x float> , %t - %tx = fmul nsz <8 x float> %x, %t - %ty = fmul nsz <8 x float> %y, %t1 - %r = fadd nsz <8 x float> %tx, %ty + %t1 = fsub contract nsz <8 x float> , %t + %tx = fmul contract nsz <8 x float> %x, %t + %ty = fmul contract nsz <8 x float> %y, %t1 + %r = fadd contract nsz <8 x float> %tx, %ty ret <8 x float> %r } @@ -1455,10 +1455,10 @@ define double @test_f64_interp(double %x, double %y, double %t) { ; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz double 1.0, %t - %tx = fmul nsz double %x, %t - %ty = fmul nsz double %y, %t1 - %r = fadd nsz double %tx, %ty + %t1 = fsub contract nsz double 1.0, %t + %tx = fmul contract nsz double %x, %t + %ty = fmul contract nsz double %y, %t1 + %r = fadd contract nsz double %tx, %ty ret double %r } @@ -1507,10 +1507,10 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <2 x double> , %t - %tx = fmul nsz <2 x double> %x, %t - %ty = fmul nsz <2 x double> %y, %t1 - %r = fadd nsz <2 x double> %tx, %ty + %t1 = fsub contract nsz <2 x double> , %t + %tx = fmul contract nsz <2 x double> %x, %t + %ty = fmul contract nsz <2 x double> %y, %t1 + %r = fadd contract nsz <2 x double> %tx, %ty ret <2 x double> %r } @@ -1556,10 +1556,10 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <4 x double> , %t - %tx = fmul nsz <4 x double> %x, %t - %ty = fmul nsz <4 x double> %y, %t1 - %r = fadd nsz <4 x double> %tx, %ty + %t1 = fsub contract nsz <4 x double> , %t + %tx = fmul contract nsz <4 x double> %x, %t + %ty = fmul contract nsz <4 x double> %y, %t1 + %r = fadd contract nsz <4 x double> %tx, %ty ret <4 x double> %r } @@ -1582,9 +1582,9 @@ define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq %mul = fmul contract nsz <4 x float> %a0, %a1 %add = fadd contract nsz <4 x float> %mul, %a2 %neg = fsub contract nsz <4 x float> ,
%add ret <4 x float> %neg } @@ -1603,9 +1603,9 @@ define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, < ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %mul = fmul nsz <4 x double> %a0, %a1 - %sub = fsub nsz <4 x double> %mul, %a2 - %neg = fsub nsz <4 x double> , %sub + %mul = fmul contract nsz <4 x double> %a0, %a1 + %sub = fsub contract nsz <4 x double> %mul, %a2 + %neg = fsub contract nsz <4 x double> , %sub ret <4 x double> %neg } @@ -1624,10 +1624,10 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq %mul = fmul contract nsz <4 x float> %a0, %a1 %neg0 = fsub contract nsz <4 x float> , %mul %add = fadd contract nsz <4 x float> %neg0, %a2 %neg1 = fsub contract nsz <4 x float> , %add ret <4 x float> %neg1 } @@ -1646,15 +1646,15 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq %mul = fmul contract nsz <4 x double> %a0, %a1 %neg0 = fsub contract nsz <4 x double> , %mul %sub = fsub contract nsz <4 x double> %neg0, %a2 %neg1 = fsub contract nsz <4 x double> , %sub ret <4 x double> %neg1 } ; -; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) +; Pattern: (fma x, c1, (fmul contract x, c2)) -> (fmul contract x, c1+c2) ; define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) { @@ -1672,14 +1672,14 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq %m0 = fmul contract reassoc <4 x float> %x, %m1 = fmul contract reassoc <4 x float> %x, %a = fadd contract reassoc <4 x float> %m0, %m1 ret <4 x float> %a } ; -; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) +; Pattern: (fma (fmul contract x, c1), c2, y) -> (fma x, c1*c2, y) ; define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) { @@ -1697,13 +1697,13 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 ; AVX512-NEXT: retq %m0 = fmul contract reassoc <4 x float> %x, %m1 = fmul contract reassoc <4 x float> %m0, %a = fadd contract reassoc <4 x float> %m1, %y ret <4 x float> %a } -; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0) +; Pattern: (fneg (fmul contract x, y)) -> (fnmsub x, y, 0) define double @test_f64_fneg_fmul(double %x, double %y) { ; FMA-LABEL: test_f64_fneg_fmul: @@ -1723,8 +1723,8 @@
define double @test_f64_fneg_fmul(double %x, double %y) { ; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq %m = fmul contract nsz double %x, %y %n = fsub contract double -0.0, %m ret double %n } @@ -1746,8 +1746,8 @@ define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) { ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq %m = fmul contract nsz <4 x float> %x, %y %n = fsub contract <4 x float> , %m ret <4 x float> %n } @@ -1769,8 +1769,8 @@ define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) { ; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq %m = fmul contract nsz <4 x double> %x, %y %n = fsub contract <4 x double> , %m ret <4 x double> %n } @@ -1792,8 +1792,8 @@ define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> % ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512-NEXT: retq %m = fmul contract <4 x double> %x, %y %n = fsub contract <4 x double> , %m ret <4 x double> %n } @@ -1817,15 +1817,15 @@ define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, doubl ; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 ; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %m1 = fmul fast double %a, %b - %m2 = fmul fast double %c, %d - %a1 = fadd fast double %m1, %m2 - %a2 = fadd fast double %a1, %n1 + %m1 = fmul contract fast double %a, %b + %m2 = fmul contract fast double %c, %d + %a1 = fadd contract fast double %m1, %m2 + %a2 = fadd contract fast double %a1, %n1 ret double %a2 } ; Minimum FMF - the 1st fadd is contracted because that combines ; fmul+fadd as specified by the order of operations; the 2nd fadd ; requires reassociation to fuse with c*d.
define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n0) nounwind { @@ -1846,10 +1846,10 @@ define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %m1 = fmul float %a, %b - %m2 = fmul float %c, %d + %m1 = fmul contract float %a, %b + %m2 = fmul contract float %c, %d %a1 = fadd contract float %m1, %m2 - %a2 = fadd reassoc float %n0, %a1 + %a2 = fadd contract reassoc float %n0, %a1 ret float %a2 } @@ -1876,14 +1876,14 @@ define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) ; AVX512-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul float %a, %b - %m2 = fmul float %c, %d + %m1 = fmul contract float %a, %b + %m2 = fmul contract float %c, %d %a1 = fadd contract float %m1, %m2 %a2 = fadd contract float %n0, %a1 ret float %a2 } ; The final fadd can be folded with either 1 of the leading fmuls. define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind { ; FMA-LABEL: fadd_fma_fmul_3: @@ -1911,13 +1911,13 @@ define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x do ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 ; AVX512-NEXT: vmovapd %xmm2, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast <2 x double> %x1, %x2 - %m2 = fmul fast <2 x double> %x3, %x4 - %m3 = fmul fast <2 x double> %x5, %x6 - %m4 = fmul fast <2 x double> %x7, %x8 - %a1 = fadd fast <2 x double> %m1, %m2 - %a2 = fadd fast <2 x double> %m3, %m4 - %a3 = fadd fast <2 x double> %a1, %a2 + %m1 = fmul contract fast <2 x double> %x1, %x2 + %m2 = fmul contract fast <2 x double> %x3, %x4 + %m3 = fmul contract fast <2 x double> %x5, %x6 + %m4 = fmul contract fast <2 x double> %x7, %x8 + %a1 = fadd contract fast <2 x double> %m1, %m2 + %a2 = fadd contract fast <2 x double> %m3, %m4 + %a3 = fadd contract fast <2 x double> %a1, %a2 ret <2 x double> %a3 } @@ -1947,11 +1947,11 @@ define float @fadd_fma_fmul_extra_use_1(float %a, float %b, float %c, float %d, ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm0 ; AVX512-NEXT: vaddss %xmm2, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast float %a, %b + %m1 = fmul contract fast float %a, %b store float %m1, ptr %p - %m2 = fmul fast float %c, %d - %a1 = fadd fast float %m1, %m2 - %a2 = fadd fast float %n0, %a1 + %m2 = fmul contract fast float %c, %d + %a1 = fadd contract fast float %m1, %m2 + %a2 = fadd contract fast float %n0, %a1 ret float %a2 } @@ -1981,11 +1981,11 @@ define float @fadd_fma_fmul_extra_use_2(float %a, float %b, float %c, float %d, ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast float %a, %b - %m2 = fmul fast float %c, %d + %m1 = fmul contract fast float %a, %b + %m2 = fmul contract fast float %c, %d store float %m2, ptr %p - %a1 = fadd fast float %m1, %m2 - %a2 = fadd fast float %n0, %a1 + %a1 = fadd contract fast float %m1, %m2 + %a2 = fadd contract fast float %n0, %a1 ret float %a2 }
@@ -2015,10 +2015,10 @@ define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d, ; AVX512-NEXT: vmovss %xmm2, (%rdi) ; AVX512-NEXT: vaddss %xmm2, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast float %a, %b - %m2 = fmul fast float %c, %d - %a1 = fadd fast float %m1, %m2 + %m1 = fmul contract fast float %a, %b + %m2 = fmul contract fast float %c, %d + %a1 = fadd contract fast float %m1, %m2 store float %a1, ptr %p - %a2 = fadd fast float %n0, %a1 + %a2 = fadd contract fast float %n0, %a1 ret float %a2 } diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index d910110467ee0..4546df5d0255a 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS ; ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) @@ -29,8 +29,8 @@ define <16 x float> @test_16f32_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> 
%a0, %a1 - %res = fadd <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fadd contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -51,8 +51,8 @@ define <8 x double> @test_8f64_fmadd(<8 x double> %a0, <8 x double> %a1, <8 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fadd <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fadd contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -77,8 +77,8 @@ define <16 x float> @test_16f32_fmsub(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -99,8 +99,8 @@ define <8 x double> @test_8f64_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fsub <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fsub contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -125,8 +125,8 @@ define <16 x float> @test_16f32_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %a2, %x + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %a2, %x ret <16 x float> %res } @@ -147,8 +147,8 @@ define <8 x double> @test_8f64_fnmadd(<8 x double> %a0, <8 x double> %a1, <8 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fsub <8 x double> %a2, %x + %x = fmul contract <8 x double> %a0, %a1 + %res = fsub contract <8 x double> %a2, %x ret <8 x double> %res } @@ -173,9 +173,9 @@ define <16 x float> @test_16f32_fnmsub(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %y = fsub <16 x float> , %x - %res = fsub <16 x float> %y, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %y = fsub contract <16 x float> , %x + %res = fsub contract <16 x float> %y, %a2 ret <16 x float> %res } @@ -196,9 +196,9 @@ define <8 x double> @test_8f64_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %y = fsub <8 x double> , %x - %res = fsub <8 x double> %y, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %y = fsub contract <8 x double> , %x + %res = fsub contract <8 x double> %y, %a2 ret <8 x double> %res } @@ -224,8 +224,8 @@ define <16 x float> @test_16f32_fmadd_load(ptr %a0, <16 x float> %a1, <16 x floa ; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1 ; AVX512-NEXT: retq %x = load <16 x float>, ptr %a0 - %y = fmul <16 x float> %x, %a1 - %res = fadd <16 x float> %y, %a2 + %y = fmul contract <16 x float> %x, %a1 + %res = fadd contract <16 x float> %y, %a2 ret <16 x float> %res } @@ -247,8 +247,8 @@ define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double ; AVX512-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * mem) - zmm1 ; AVX512-NEXT: 
retq %x = load <8 x double>, ptr %a0 - %y = fmul <8 x double> %x, %a1 - %res = fsub <8 x double> %y, %a2 + %y = fmul contract <8 x double> %x, %a1 + %res = fsub contract <8 x double> %y, %a2 ret <8 x double> %res } @@ -297,8 +297,8 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> % ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <16 x float> %x, - %m = fmul <16 x float> %a, %y + %a = fadd contract <16 x float> %x, + %m = fmul contract <16 x float> %a, %y ret <16 x float> %m } @@ -343,8 +343,8 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <8 x double> %x, - %m = fmul <8 x double> %y, %a + %a = fadd contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %a ret <8 x double> %m } @@ -389,8 +389,8 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <16 x float> %x, - %m = fmul <16 x float> %a, %y + %a = fadd contract <16 x float> %x, + %m = fmul contract <16 x float> %a, %y ret <16 x float> %m } @@ -435,8 +435,8 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <8 x double> %x, - %m = fmul <8 x double> %y, %a + %a = fadd contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %a ret <8 x double> %m } @@ -482,8 +482,8 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> % ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> , %x - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> , %x + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -529,8 +529,8 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> , %x - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> , %x + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -576,8 +576,8 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> , %x - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> , %x + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -623,8 +623,8 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> , %x - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> , %x + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -669,8 +669,8 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> % ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - 
%s = fsub <16 x float> %x, - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> %x, + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -715,8 +715,8 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> %x, - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -761,8 +761,8 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> %x, - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> %x, + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -807,8 +807,8 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> %x, - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -868,10 +868,10 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <16 x float> , %t - %tx = fmul nsz <16 x float> %x, %t - %ty = fmul nsz <16 x float> %y, %t1 - %r = fadd nsz <16 x float> %tx, %ty + %t1 = fsub contract nsz <16 x float> , %t + %tx = fmul contract nsz <16 x float> %x, %t + %ty = fmul contract nsz <16 x float> %y, %t1 + %r = fadd contract nsz <16 x float> %tx, %ty ret <16 x float> %r } @@ -927,10 +927,10 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <8 x double> , %t - %tx = fmul nsz <8 x double> %x, %t - %ty = fmul nsz <8 x double> %y, %t1 - %r = fadd nsz <8 x double> %tx, %ty + %t1 = fsub contract nsz <8 x double> , %t + %tx = fmul contract nsz <8 x double> %x, %t + %ty = fmul contract nsz <8 x double> %y, %t1 + %r = fadd contract nsz <8 x double> %tx, %ty ret <8 x double> %r } @@ -955,9 +955,9 @@ define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <16 x float> %a0, %a1 - %add = fadd nsz <16 x float> %mul, %a2 - %neg = fsub nsz <16 x float> , %add + %mul = fmul contract nsz <16 x float> %a0, %a1 + %add = fadd contract nsz <16 x float> %mul, %a2 + %neg = fsub contract nsz <16 x float> , %add ret <16 x float> %neg } @@ -978,9 +978,9 @@ define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, < ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <8 x double> %a0, %a1 - %sub = fsub nsz <8 x double> %mul, %a2 - %neg = fsub nsz <8 x double> , %sub + %mul = fmul contract nsz <8 x double> %a0, %a1 + %sub = fsub contract nsz <8 x double> %mul, %a2 + %neg = fsub contract nsz <8 x double> , %sub ret <8 x double> 
%neg } @@ -1001,10 +1001,10 @@ define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <16 x float> %a0, %a1 - %neg0 = fsub nsz <16 x float> , %mul - %add = fadd nsz <16 x float> %neg0, %a2 - %neg1 = fsub nsz <16 x float> , %add + %mul = fmul contract nsz <16 x float> %a0, %a1 + %neg0 = fsub contract nsz <16 x float> , %mul + %add = fadd contract nsz <16 x float> %neg0, %a2 + %neg1 = fsub contract nsz <16 x float> , %add ret <16 x float> %neg1 } @@ -1025,10 +1025,10 @@ define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <8 x double> %a0, %a1 - %neg0 = fsub nsz <8 x double> , %mul - %sub = fsub nsz <8 x double> %neg0, %a2 - %neg1 = fsub nsz <8 x double> , %sub + %mul = fmul contract nsz <8 x double> %a0, %a1 + %neg0 = fsub contract nsz <8 x double> , %mul + %sub = fsub contract nsz <8 x double> %neg0, %a2 + %neg1 = fsub contract nsz <8 x double> , %sub ret <8 x double> %neg1 } @@ -1108,8 +1108,8 @@ define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %m = fmul nsz <16 x float> %x, %y - %n = fsub <16 x float> , %m + %m = fmul contract nsz <16 x float> %x, %y + %n = fsub contract <16 x float> , %m ret <16 x float> %n } @@ -1133,8 +1133,8 @@ define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 { ; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %m = fmul nsz <8 x double> %x, %y - %n = fsub <8 x double> , %m + %m = fmul contract nsz <8 x double> %x, %y + %n = fsub contract <8 x double> , %m ret <8 x double> %n } @@ -1162,8 +1162,8 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512-NEXT: retq - %m = fmul <8 x double> %x, %y - %n = fsub <8 x double> , %m + %m = fmul contract <8 x double> %x, %y + %n = fsub contract <8 x double> , %m ret <8 x double> %n } diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index d59b12c6d1231..d8c8b1c646c7c 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-NO-FASTFMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-FMA declare i16 @llvm.umax.i16(i16, i16) declare i64 @llvm.umin.i64(i64, i64) @@ -1007,14 +1007,6 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; 
CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] -; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] @@ -1024,8 +1016,8 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; CHECK-FMA-NEXT: retq %shl = shl nsw nuw <4 x i32> , %cnt %conv = uitofp <4 x i32> %shl to <4 x float> - %mul = fmul <4 x float> , %conv - %res = fadd <4 x float> %mul, %add + %mul = fmul contract <4 x float> , %conv + %res = fadd contract <4 x float> %mul, %add ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index 42617c1573be5..417910df5c457 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -fp-contract=fast < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s declare float @llvm.sqrt.f32(float) #2 @@ -10,10 +10,10 @@ define float @sqrt_ieee(float %f) #0 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = contract nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call float @llvm.sqrt.f32(float %f) + %call = tail call contract float @llvm.sqrt.f32(float %f) ret float %call } @@ -25,16 +25,16 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], 
implicit $mxcsr - ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] ; CHECK-NEXT: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool) @@ -46,7 +46,7 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]] ; CHECK-NEXT: $xmm0 = COPY [[COPY5]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call ninf afn float @llvm.sqrt.f32(float %f) + %call = tail call contract ninf afn float @llvm.sqrt.f32(float %f) ret float %call } @@ -57,10 +57,10 @@ define float @sqrt_daz(float %f) #1 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = contract nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call float @llvm.sqrt.f32(float %f) + %call = tail call contract float @llvm.sqrt.f32(float %f) ret float %call } @@ -72,16 +72,16 @@ define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn 
nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr - ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]] ; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS ; CHECK-NEXT: [[VCMPSSrri:%[0-9]+]]:fr32 = nofpexcept VCMPSSrri [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr @@ -90,7 +90,7 @@ define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]] ; CHECK-NEXT: $xmm0 = COPY [[COPY3]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call ninf afn float @llvm.sqrt.f32(float %f) + %call = tail call contract ninf afn float @llvm.sqrt.f32(float %f) ret float %call } @@ -114,7 +114,7 @@ define float @rsqrt_ieee(float %f) #0 { ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]] ; CHECK-NEXT: RET 0, $xmm0 - %sqrt = tail call float @llvm.sqrt.f32(float %f) + %sqrt = tail call contract float @llvm.sqrt.f32(float %f) %div = fdiv fast float 1.0, %sqrt ret float %div } @@ -139,7 +139,7 @@ define float @rsqrt_daz(float %f) #1 { ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]] ; CHECK-NEXT: RET 0, $xmm0 - %sqrt = tail call float @llvm.sqrt.f32(float %f) + %sqrt = tail call contract float @llvm.sqrt.f32(float %f) %div = fdiv fast float 1.0, %sqrt ret float %div } From be5e2ea78b5f1146ba3d3545bf61022845ec89f9 Mon Sep 17 00:00:00 2001 From: "Pirog, Mikolaj Maciej" Date: Fri, 12 Sep 2025 14:18:21 +0200 Subject: [PATCH 2/6] Reviewer suggestions --- llvm/test/CodeGen/X86/avx512-fma.ll | 4 +- .../X86/avx512fp16-combine-vfmac-fadd.ll | 4 +- .../X86/avx512fp16-combine-vfmulc-fadd.ll | 2 +- .../X86/avx512fp16-combine-xor-vfmulc-fadd.ll | 2 +- .../X86/avx512fp16-combine-xor-vfmulc.ll | 2 +- .../CodeGen/X86/dag-combiner-fma-folding.ll | 1 - 
llvm/test/CodeGen/X86/fma-do-not-commute.ll | 2 +- llvm/test/CodeGen/X86/fma_patterns.ll | 16 +- llvm/test/CodeGen/X86/fma_patterns_wide.ll | 16 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 1761 +++++++++++------ llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 2 +- 11 files changed, 1161 insertions(+), 651 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-fma.ll b/llvm/test/CodeGen/X86/avx512-fma.ll index 54343ee771ff7..29120c8815aea 100644 --- a/llvm/test/CodeGen/X86/avx512-fma.ll +++ b/llvm/test/CodeGen/X86/avx512-fma.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; ALL-LABEL: test_x86_fmadd_ps_z: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll index 52e9507d43a1f..f1477b57375c4 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ ; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set. 
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) { diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll index 20df18c5a18b9..5d9784aa5d2eb 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll index d96d7d1602040..b58bae93ed660 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll index caf428a8c94d3..92bdebb34979a 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll index c13b534b6881a..3ebbf34dd8367 100644 --- a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll +++ b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - | FileCheck %s -; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - | FileCheck %s define double @fma_folding(double %x) { ; CHECK-LABEL: fma_folding: diff --git a/llvm/test/CodeGen/X86/fma-do-not-commute.ll b/llvm/test/CodeGen/X86/fma-do-not-commute.ll index 3009db2859dba..1b60c15cf2be0 100644 --- a/llvm/test/CodeGen/X86/fma-do-not-commute.ll +++ b/llvm/test/CodeGen/X86/fma-do-not-commute.ll @@ -1,4 +1,4 @@ -; RUN: llc -mattr=+fma -disable-cgp < %s -o - | FileCheck %s +; RUN: llc -mattr=+fma -disable-cgp < %s -o - | FileCheck %s ; Check that the 2nd and 3rd arguments of fmaXXX231 reg1, reg2, mem3 are not commuted. 
; target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index 8b409ba53b0fb..61f321566bad8 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=FMA,FMA-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefixes=FMA4,FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefixes=FMA4,FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=FMA,FMA-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefixes=FMA4,FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefixes=FMA4,FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS ; ; Pattern: (fadd contract (fmul contract x, y), z) -> (fmadd x,y,z) diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index 4546df5d0255a..f0af3945ae959 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS -; RUN: llc < 
%s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS ; ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index d8c8b1c646c7c..f5f86aa70fc0f 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-NO-FASTFMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-SKX declare i16 @llvm.umax.i16(i16, i16) declare i64 @llvm.umin.i64(i64, i64) @@ -16,25 +16,25 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow2_4xfloat: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow2_4xfloat: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq 
-; -; CHECK-FMA-LABEL: fmul_pow2_4xfloat: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow2_4xfloat: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow2_4xfloat: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_4xfloat: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fmul <4 x float> , %p2_f @@ -111,6 +111,72 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-AVX-NEXT: addq $40, %rsp ; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: subq $40, %rsp +; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 48 +; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX512F-NEXT: vextractps $1, %xmm0, %edi +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: callq ldexpf@PLT +; CHECK-AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX512F-NEXT: vmovd %xmm0, %edi +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: callq ldexpf@PLT +; CHECK-AVX512F-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; CHECK-AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX512F-NEXT: vextractps $2, %xmm0, %edi +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: callq ldexpf@PLT +; CHECK-AVX512F-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX512F-NEXT: vextractps $3, %xmm0, %edi +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: callq ldexpf@PLT +; CHECK-AVX512F-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-AVX512F-NEXT: addq $40, %rsp +; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: subq $40, %rsp +; CHECK-SKX-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vextractps $1, %xmm0, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-SKX-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vmovd %xmm0, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; CHECK-SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vextractps $2, %xmm0, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vextractps $3, %xmm0, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-SKX-NEXT: addq $40, %rsp +; CHECK-SKX-NEXT: .cfi_def_cfa_offset 8 +; CHECK-SKX-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) ret <4 x float> %r } @@ -130,6 +196,20 @@ define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) { ; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] ; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow2_4xfloat: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-AVX512F-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow2_4xfloat: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-SKX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fdiv <4 x float> , %p2_f @@ -262,143 +342,143 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow2_8xhalf: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: subq $120, %rsp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 128 -; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
CHECK-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[1,0] -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[1,0] -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX2-NEXT: addq $120, %rsp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow2_8xhalf: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; CHECK-NO-FASTFMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] -; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow2_8xhalf: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 
-; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vzeroupper -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow2_8xhalf: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: subq $120, %rsp +; CHECK-AVX-NEXT: .cfi_def_cfa_offset 128 +; CHECK-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; CHECK-AVX-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; CHECK-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX-NEXT: vzeroupper +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-AVX-NEXT: vzeroupper +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = mem[1,0] +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX-NEXT: vzeroupper +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = mem[1,0] +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; 
CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-AVX-NEXT: addq $120, %rsp +; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: 
fmul_pow2_8xhalf:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; CHECK-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
+; CHECK-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; CHECK-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
+; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
+; CHECK-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-AVX512F-NEXT: vzeroupper
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_8xhalf:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-SKX-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-SKX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %ymm0
+; CHECK-SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-SKX-NEXT: vzeroupper
+; CHECK-SKX-NEXT: retq
 %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
 %p2_f = uitofp <8 x i16> %p2 to <8 x half>
 %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -483,82 +563,82 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-SSE-NEXT: retq
 ;
-; CHECK-AVX2-LABEL: fmul_pow2_ldexp_8xhalf:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: subq $72, %rsp
-; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 80
-; CHECK-AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX2-NEXT: movswl %ax, %edi
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: callq ldexpf@PLT
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX2-NEXT: movswl %ax, %edi
-; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: callq ldexpf@PLT
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX2-NEXT: movswl %ax, %edi
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: callq ldexpf@PLT
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX2-NEXT: movswl %ax, %edi
-; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: callq ldexpf@PLT
-; 
CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-AVX2-NEXT: movswl %ax, %edi -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: callq ldexpf@PLT -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX2-NEXT: movswl %ax, %edi -; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: callq ldexpf@PLT -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-AVX2-NEXT: movswl %ax, %edi -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: callq ldexpf@PLT -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX2-NEXT: vmovd %xmm0, %eax -; CHECK-AVX2-NEXT: movswl %ax, %edi -; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: callq ldexpf@PLT -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX2-NEXT: addq $72, %rsp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow2_ldexp_8xhalf: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: subq $72, %rsp +; CHECK-AVX-NEXT: .cfi_def_cfa_offset 80 +; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX-NEXT: vmovd %xmm0, %eax +; CHECK-AVX-NEXT: movswl %ax, %edi +; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: callq ldexpf@PLT +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-AVX-NEXT: addq $72, %rsp +; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX-NEXT: retq ; ; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: ; CHECK-AVX512F: # %bb.0: @@ -636,6 +716,83 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: addq $72, %rsp ; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 ; 
CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: subq $72, %rsp +; CHECK-SKX-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SKX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-SKX-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SKX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SKX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-SKX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SKX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-SKX-NEXT: movswl %ax, %edi +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: callq ldexpf@PLT +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SKX-NEXT: vmovd %xmm0, %eax +; CHECK-SKX-NEXT: 
movswl %ax, %edi
+; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-SKX-NEXT: callq ldexpf@PLT
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-SKX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-SKX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-SKX-NEXT: addq $72, %rsp
+; CHECK-SKX-NEXT: .cfi_def_cfa_offset 8
+; CHECK-SKX-NEXT: retq
 %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
 ret <8 x half> %r
}
@@ -649,26 +806,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
 ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
-; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fdiv_pow2_8xhalf:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fdiv_pow2_8xhalf:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-SKX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
 %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
 %p2_f = uitofp <8 x i16> %p2 to <8 x half>
 %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -695,6 +852,24 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
 ; CHECK-AVX-NEXT: addq %rax, %rcx
 ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
 ; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: movzbl %dil, %eax
+; CHECK-AVX512F-NEXT: shlq $52, %rax
+; CHECK-AVX512F-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000
+; CHECK-AVX512F-NEXT: addq %rax, %rcx
+; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: movzbl %dil, %eax
+; CHECK-SKX-NEXT: shlq $52, %rax
+; CHECK-SKX-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000
+; CHECK-SKX-NEXT: addq %rax, %rcx
+; 
CHECK-SKX-NEXT: vmovq %rcx, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv @@ -723,6 +898,26 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; CHECK-AVX-NEXT: addq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: incl %eax +; CHECK-AVX512F-NEXT: shlq $52, %rax +; CHECK-AVX512F-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-AVX512F-NEXT: addq %rax, %rcx +; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: incl %eax +; CHECK-SKX-NEXT: shlq $52, %rax +; CHECK-SKX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-SKX-NEXT: addq %rax, %rcx +; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double %mul = fmul double -9.000000e+00, %conv @@ -748,6 +943,24 @@ define double @fmul_pow_shl_cnt3(i8 %cnt) nounwind { ; CHECK-AVX-NEXT: addq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt3: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: shlq $52, %rax +; CHECK-AVX512F-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-AVX512F-NEXT: addq %rax, %rcx +; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt3: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: shlq $52, %rax +; CHECK-SKX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-SKX-NEXT: addq %rax, %rcx +; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 +; CHECK-SKX-NEXT: retq %zext_cnt = zext i8 %cnt to i64 %shl = shl nuw i64 1, %zext_cnt %conv = uitofp i64 %shl to double @@ -779,6 +992,28 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 ; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_select: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: leal 1(%rax), %ecx +; CHECK-AVX512F-NEXT: testb $1, %sil +; CHECK-AVX512F-NEXT: cmovnel %eax, %ecx +; CHECK-AVX512F-NEXT: shll $23, %ecx +; CHECK-AVX512F-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-AVX512F-NEXT: vmovd %ecx, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_select: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: leal 1(%rax), %ecx +; CHECK-SKX-NEXT: testb $1, %sil +; CHECK-SKX-NEXT: cmovnel %eax, %ecx +; CHECK-SKX-NEXT: shll $23, %ecx +; CHECK-SKX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-SKX-NEXT: vmovd %ecx, %xmm0 +; CHECK-SKX-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt %shl = select i1 %c, i32 %shl1, i32 %shl2 @@ -813,6 +1048,30 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 ; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: addl $3, %eax +; CHECK-AVX512F-NEXT: cmpl $13, %eax +; 
CHECK-AVX512F-NEXT: movl $13, %ecx +; CHECK-AVX512F-NEXT: cmovbl %eax, %ecx +; CHECK-AVX512F-NEXT: shll $23, %ecx +; CHECK-AVX512F-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-AVX512F-NEXT: vmovd %ecx, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: addl $3, %eax +; CHECK-SKX-NEXT: cmpl $13, %eax +; CHECK-SKX-NEXT: movl $13, %ecx +; CHECK-SKX-NEXT: cmovbl %eax, %ecx +; CHECK-SKX-NEXT: shll $23, %ecx +; CHECK-SKX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-SKX-NEXT: vmovd %ecx, %xmm0 +; CHECK-SKX-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) %conv = uitofp i64 %shl to float @@ -846,6 +1105,30 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: orq %rcx, %rax ; CHECK-AVX-NEXT: vmovq %rax, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_mul_max_pow2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: leaq 1(%rax), %rcx +; CHECK-AVX512F-NEXT: cmpq %rcx, %rax +; CHECK-AVX512F-NEXT: cmovaq %rax, %rcx +; CHECK-AVX512F-NEXT: shlq $52, %rcx +; CHECK-AVX512F-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 +; CHECK-AVX512F-NEXT: orq %rcx, %rax +; CHECK-AVX512F-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_mul_max_pow2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: leaq 1(%rax), %rcx +; CHECK-SKX-NEXT: cmpq %rcx, %rax +; CHECK-SKX-NEXT: cmovaq %rax, %rcx +; CHECK-SKX-NEXT: shlq $52, %rcx +; CHECK-SKX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 +; CHECK-SKX-NEXT: orq %rcx, %rax +; CHECK-SKX-NEXT: vmovq %rax, %xmm0 +; CHECK-SKX-NEXT: retq %shl2 = shl nuw i16 2, %cnt %shl1 = shl nuw i16 1, %cnt %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2) @@ -869,34 +1152,34 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rsi, %rcx -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rdi -; CHECK-AVX2-NEXT: vmovq %rdi, %xmm0 -; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movq %rsi, %rcx +; CHECK-AVX-NEXT: # 
kill: def $cl killed $cl killed $rcx +; CHECK-AVX-NEXT: shlq %cl, %rdi +; CHECK-AVX-NEXT: vmovq %rdi, %xmm0 +; CHECK-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movq %rsi, %rcx +; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX512F-NEXT: shlq %cl, %rdi +; CHECK-AVX512F-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 +; CHECK-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: shlxq %rsi, %rdi, %rax +; CHECK-SKX-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i64 %v, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv @@ -922,39 +1205,39 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 -; CHECK-AVX2-NEXT: vmovq %xmm0, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] -; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtqq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; CHECK-AVX-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 +; CHECK-AVX-NEXT: vmovq %xmm0, %rax +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; 
CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2]
+; CHECK-AVX512F-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
+; CHECK-AVX512F-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
+; CHECK-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
+; CHECK-AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-SKX-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: vcvtqq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-SKX-NEXT: retq
 %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
 %conv = uitofp <2 x i64> %shl to <2 x float>
 %mul = fmul <2 x float> <float 1.500000e+01, float 1.500000e+01>, %conv
@@ -968,23 +1251,23 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsllq $52, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpsllq $52, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpsllq $52, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; CHECK-SKX-NEXT: retq
 %shl = shl nsw nuw <2 x i64> , %cnt
 %conv = uitofp <2 x i64> %shl to <2 x double>
 %mul = fmul <2 x double> , %conv
@@ -999,21 +1282,30 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ; CHECK-SSE-NEXT: addps %xmm1, %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192]
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
-; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
-; CHECK-FMA-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: 
fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192]
+; CHECK-AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-AVX512F-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
+; CHECK-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm2
+; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
+; CHECK-AVX512F-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-SKX-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
+; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
+; CHECK-SKX-NEXT: retq
 %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
 %conv = uitofp <4 x i32> %shl to <4 x float>
 %mul = fmul contract <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv
@@ -1021,6 +1313,43 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ret <4 x float> %res
}
+define <4 x float> @fmul_pow_shl_cnt_vec_no_fma(<4 x i32> %cnt, <4 x float> %add) nounwind {
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_no_fma:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: pslld $23, %xmm0
+; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: addps %xmm1, %xmm0
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_no_fma:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192]
+; CHECK-AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_no_fma:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192]
+; CHECK-AVX512F-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_no_fma:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-SKX-NEXT: retq
+ %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
+ %conv = uitofp <4 x i32> %shl to <4 x float>
+ %mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv
+ %res = fadd <4 x float> %mul, %add
+ ret <4 x float> %res
+}
+
 define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind {
 ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
 ; CHECK-SSE: # %bb.0:
@@ -1033,6 +1362,18 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0
 ; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0
+; 
CHECK-SKX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -1051,6 +1392,18 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 ; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -1092,65 +1445,65 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-NEXT: addq $40, %rsp ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: subq $56, %rsp -; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] -; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-AVX2-NEXT: addq $56, %rsp -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NO-FASTFMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: 
vcvtph2ps %xmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] -; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vzeroupper -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: subq $56, %rsp +; CHECK-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-AVX-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-AVX-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: vzeroupper +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-AVX-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: vzeroupper +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-AVX-NEXT: addq $56, %rsp +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-AVX512F-NEXT: vzeroupper +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: 
fmul_pow_shl_cnt_vec_fail_to_large:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; CHECK-SKX-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-SKX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %ymm0
+; CHECK-SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-SKX-NEXT: vzeroupper
+; CHECK-SKX-NEXT: retq
 %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
 %conv = uitofp <2 x i16> %shl to <2 x half>
 %mul = fmul <2 x half> <half 0xH4B80, half 0xH4B80>, %conv
@@ -1173,37 +1526,37 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: movq %rdi, %rcx
+; CHECK-AVX-NEXT: movl $1, %eax
+; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; CHECK-AVX-NEXT: shlq %cl, %rax
+; CHECK-AVX-NEXT: vmovq %rax, %xmm0
+; CHECK-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: movq %rdi, %rcx
+; CHECK-AVX512F-NEXT: movl $1, %eax
+; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
+; CHECK-AVX512F-NEXT: shlq %cl, %rax
+; CHECK-AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
+; CHECK-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: movl $1, %eax
+; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax
+; CHECK-SKX-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
+; CHECK-SKX-NEXT: vmulsd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-SKX-NEXT: retq
 %shl = shl nuw i64 1, %cnt
 %conv = uitofp i64 %shl to double
 %mul = fmul double 9.745314e+288, %conv
@@ -1230,6 +1583,24 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
 ; CHECK-AVX-NEXT: addq %rax, %rcx
 ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
 ; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_safe:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: movzbl %dil, %eax
+; CHECK-AVX512F-NEXT: shlq $52, %rax
+; CHECK-AVX512F-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992
+; CHECK-AVX512F-NEXT: addq %rax, %rcx
+; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow_shl_cnt_safe:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: movzbl %dil, %eax
+; CHECK-SKX-NEXT: shlq $52, %rax
+; CHECK-SKX-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992
+; CHECK-SKX-NEXT: addq %rax, %rcx
+; CHECK-SKX-NEXT: vmovq %rcx, %xmm0
+; CHECK-SKX-NEXT: retq
 %shl = shl nuw i16 1, %cnt
 %conv = uitofp i16 %shl to double
 %mul = fmul double 9.745314e+288, %conv
@@ -1251,6 +1622,20 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; CHECK-AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408]
 ; CHECK-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
 ; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_vec:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408]
+; CHECK-AVX512F-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_vec:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408]
+; CHECK-SKX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
 %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
 %conv = uitofp <2 x i64> %shl to <2 x double>
 %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
@@ -1273,6 +1658,22 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
 ; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
 ; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; CHECK-AVX512F: # %bb.0:
+; CHECK-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0
+; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-AVX512F-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; CHECK-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-SKX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
 %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
 %conv = uitofp <2 x i64> %shl to <2 x float>
 %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
@@ -1287,58 +1688,58 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-SSE-NEXT: shlq %cl, %rax
 ; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB23_1
+; CHECK-SSE-NEXT: js .LBB24_1
 ; CHECK-SSE-NEXT: # %bb.2:
 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: jmp .LBB23_3
-; CHECK-SSE-NEXT: .LBB23_1:
+; 
CHECK-SSE-NEXT: jmp .LBB24_3 +; CHECK-SSE-NEXT: .LBB24_1: ; CHECK-SSE-NEXT: shrq %rax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1 -; CHECK-SSE-NEXT: .LBB23_3: +; CHECK-SSE-NEXT: .LBB24_3: ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $8, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: testq %rax, %rax -; CHECK-AVX2-NEXT: js .LBB23_1 -; CHECK-AVX2-NEXT: # %bb.2: -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: jmp .LBB23_3 -; CHECK-AVX2-NEXT: .LBB23_1: -; CHECK-AVX2-NEXT: shrq %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: .LBB23_3: -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $8, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movq %rdi, %rcx +; CHECK-AVX-NEXT: movl $8, %eax +; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX-NEXT: shlq %cl, %rax +; CHECK-AVX-NEXT: testq %rax, %rax +; CHECK-AVX-NEXT: js .LBB24_1 +; CHECK-AVX-NEXT: # %bb.2: +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: jmp .LBB24_3 +; CHECK-AVX-NEXT: .LBB24_1: +; CHECK-AVX-NEXT: shrq %rax +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; CHECK-AVX-NEXT: .LBB24_3: +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-AVX512F-NEXT: movl $8, %eax +; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX512F-NEXT: shlq %cl, %rax +; CHECK-AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $8, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl i64 8, %cnt %conv = uitofp i64 %shl to 
float %mul = fdiv float -9.000000e+00, %conv @@ -1357,36 +1758,36 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $8, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $8, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movq %rdi, %rcx +; CHECK-AVX-NEXT: movl $8, %eax +; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX-NEXT: shlq %cl, %rax +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-AVX512F-NEXT: movl $8, %eax +; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX512F-NEXT: shlq %cl, %rax +; CHECK-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $8, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float %mul = fdiv float -9.000000e+00, %conv @@ -1411,6 +1812,24 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; CHECK-AVX-NEXT: subl %edi, %eax ; CHECK-AVX-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: andl $31, %edi +; CHECK-AVX512F-NEXT: shll $23, %edi +; CHECK-AVX512F-NEXT: movl $-1115684864, %eax # imm = 0xBD800000 +; CHECK-AVX512F-NEXT: subl %edi, %eax +; CHECK-AVX512F-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: andl $31, %edi +; CHECK-SKX-NEXT: shll $23, %edi +; CHECK-SKX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000 +; CHECK-SKX-NEXT: subl %edi, %eax +; CHECK-SKX-NEXT: 
vmovd %eax, %xmm0 +; CHECK-SKX-NEXT: retq %cnt = and i64 %cnt_in, 31 %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float @@ -1436,47 +1855,47 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: popq %rax -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: pushq %rax +; CHECK-AVX-NEXT: movl %edi, %ecx +; CHECK-AVX-NEXT: movl $1, %eax +; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX-NEXT: shll %cl, %eax +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: popq %rax +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movl %edi, %ecx +; CHECK-AVX512F-NEXT: movl $1, %eax +; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX512F-NEXT: shll %cl, %eax +; CHECK-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxl %edi, %eax, %eax +; CHECK-SKX-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
CHECK-SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to half %mul = fdiv half 0xH7000, %conv @@ -1499,6 +1918,22 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: subl %edi, %eax ; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_in_bounds: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: shll $10, %edi +; CHECK-AVX512F-NEXT: movl $28672, %eax # imm = 0x7000 +; CHECK-AVX512F-NEXT: subl %edi, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_in_bounds: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: shll $10, %edi +; CHECK-SKX-NEXT: movl $28672, %eax # imm = 0x7000 +; CHECK-SKX-NEXT: subl %edi, %eax +; CHECK-SKX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH7000, %conv @@ -1521,6 +1956,22 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: subl %edi, %eax ; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_in_bounds2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: shll $10, %edi +; CHECK-AVX512F-NEXT: movl $18432, %eax # imm = 0x4800 +; CHECK-AVX512F-NEXT: subl %edi, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_in_bounds2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: shll $10, %edi +; CHECK-SKX-NEXT: movl $18432, %eax # imm = 0x4800 +; CHECK-SKX-NEXT: subl %edi, %eax +; CHECK-SKX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4800, %conv @@ -1546,50 +1997,50 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: popq %rax -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; 
CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: pushq %rax +; CHECK-AVX-NEXT: movl %edi, %ecx +; CHECK-AVX-NEXT: movl $1, %eax +; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX-NEXT: shll %cl, %eax +; CHECK-AVX-NEXT: movzwl %ax, %eax +; CHECK-AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX-NEXT: popq %rax +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movl %edi, %ecx +; CHECK-AVX512F-NEXT: movl $1, %eax +; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX512F-NEXT: shll %cl, %eax +; CHECK-AVX512F-NEXT: movzwl %ax, %eax +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxl %edi, %eax, %eax +; CHECK-SKX-NEXT: movzwl %ax, %eax +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4000, %conv @@ -1616,6 +2067,24 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; CHECK-AVX-NEXT: subq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: shlq $52, %rax +; CHECK-AVX512F-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 +; CHECK-AVX512F-NEXT: subq %rax, %rcx +; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: shlq $52, %rax +; CHECK-SKX-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 +; CHECK-SKX-NEXT: subq %rax, %rcx +; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double %mul = fdiv double 0x36A0000000000000, %conv @@ -1634,36 +2103,36 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; 
CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movl %edi, %ecx +; CHECK-AVX-NEXT: movl $1, %eax +; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX-NEXT: shll %cl, %eax +; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movl %edi, %ecx +; CHECK-AVX512F-NEXT: movl $1, %eax +; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX512F-NEXT: shll %cl, %eax +; CHECK-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxl %edi, %eax, %eax +; CHECK-SKX-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float %mul = fdiv float 0x3a1fffff00000000, %conv @@ -1690,6 +2159,24 @@ define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind { ; CHECK-AVX-NEXT: subl %eax, %ecx ; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt32_okay: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: movzbl %dil, %eax +; CHECK-AVX512F-NEXT: shll $23, %eax +; CHECK-AVX512F-NEXT: movl $285212672, %ecx # imm = 0x11000000 +; CHECK-AVX512F-NEXT: subl %eax, %ecx +; CHECK-AVX512F-NEXT: vmovd %ecx, %xmm0 +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_okay: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movzbl %dil, %eax +; CHECK-SKX-NEXT: shll $23, %eax +; CHECK-SKX-NEXT: movl $285212672, %ecx # imm = 0x11000000 +; CHECK-SKX-NEXT: subl %eax, %ecx +; CHECK-SKX-NEXT: vmovd %ecx, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = 
uitofp i32 %shl to float %mul = fdiv float 0x3a20000000000000, %conv @@ -1718,8 +2205,32 @@ define x86_fp80 @pr128528(i1 %cond) { ; CHECK-AVX-NEXT: fildl -{{[0-9]+}}(%rsp) ; CHECK-AVX-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip) ; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512F-LABEL: pr128528: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: testb $1, %dil +; CHECK-AVX512F-NEXT: movl $8, %eax +; CHECK-AVX512F-NEXT: movl $1, %ecx +; CHECK-AVX512F-NEXT: cmovnel %eax, %ecx +; CHECK-AVX512F-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-AVX512F-NEXT: fildl -{{[0-9]+}}(%rsp) +; CHECK-AVX512F-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip) +; CHECK-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: pr128528: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: testb $1, %dil +; CHECK-SKX-NEXT: movl $8, %eax +; CHECK-SKX-NEXT: movl $1, %ecx +; CHECK-SKX-NEXT: cmovnel %eax, %ecx +; CHECK-SKX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SKX-NEXT: fildl -{{[0-9]+}}(%rsp) +; CHECK-SKX-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip) +; CHECK-SKX-NEXT: retq %sub9 = select i1 %cond, i32 8, i32 1 %conv = uitofp i32 %sub9 to x86_fp80 %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800 ret x86_fp80 %mul } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index 417910df5c457..e1649ba733498 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -fp-contract=fast -stop-after=finalize-isel 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s declare float @llvm.sqrt.f32(float) #2 From efc3660174168103b02cb078a3b014af73422b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= Date: Sun, 14 Sep 2025 17:23:30 +0200 Subject: [PATCH 3/6] Reviewer suggestions --- llvm/test/CodeGen/X86/fma_patterns.ll | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index 61f321566bad8..b81f2e888602d 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -9,7 +9,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS ; -; Pattern: (fadd contract (fmul contract x, y), z) -> (fmadd x,y,z) +; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) ; define float @test_f32_fmadd(float %a0, float %a1, float %a2) { @@ -133,7 +133,7 @@ define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x do } ; -; Pattern: (fsub contract (fmul contract x, y), z) -> (fmsub x, y, z) +; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z) ; define float @test_f32_fmsub(float %a0, float %a1, float %a2) { @@ -257,7 +257,7 @@ define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x do } ; -; Pattern: (fsub contract z, (fmul contract x, y)) -> (fnmadd x, y, z) +; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z) ; define float @test_f32_fnmadd(float %a0, float %a1, float %a2) { @@ -381,7 +381,7 @@ define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x d } ; -; Pattern: (fsub contract (fneg (fmul contract x, y)), z) -> (fnmsub x, y, z)
+; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z) ; define float @test_f32_fnmsub(float %a0, float %a1, float %a2) { @@ -1654,7 +1654,7 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, } ; -; Pattern: (fma x, c1, (fmul contract x, c2)) -> (fmul contract x, c1+c2) +; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) ; define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) { @@ -1679,7 +1679,7 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) { } ; -; Pattern: (fma (fmul contract x, c1), c2, y) -> (fma x, c1*c2, y) +; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) ; define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) { @@ -1703,7 +1703,7 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y ret <4 x float> %a } -; Pattern: (fneg (fmul contract x, y)) -> (fnmsub x, y, 0) +; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0) define double @test_f64_fneg_fmul(double %x, double %y) { ; FMA-LABEL: test_f64_fneg_fmul: @@ -1824,8 +1824,8 @@ define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, doubl ret double %a2 } -; Minimum FMF - the 1st fadd contract is contracted because that combines -; fmul contract+fadd contract as specified by the order of operations; the 2nd fadd contract +; Minimum FMF - the 1st fadd is contracted because that combines +; fmul+fadd as specified by the order of operations; the 2nd fadd ; requires reassociation to fuse with c*d. define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n0) nounwind { @@ -1883,7 +1883,7 @@ define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) ret float %a2 } -; The final fadd contract can be folded with either 1 of the leading fmul contracts. +; The final fadd can be folded with either 1 of the leading fmuls. 
define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind { ; FMA-LABEL: fadd_fma_fmul_3: From c509ca95722b96b816ed652e00fd39b1ee018fd6 Mon Sep 17 00:00:00 2001 From: "Pirog, Mikolaj Maciej" Date: Mon, 15 Sep 2025 18:13:33 +0200 Subject: [PATCH 4/6] Reviewer suggestions --- llvm/test/CodeGen/X86/fma_patterns.ll | 56 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 1336 +++++------------ llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 2 +- 3 files changed, 383 insertions(+), 1011 deletions(-) diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index b81f2e888602d..be5e23cd4cce3 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1582,9 +1582,9 @@ define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %mul = fmul contract contract nsz <4 x float> %a0, %a1 - %add = fadd contract contract nsz <4 x float> %mul, %a2 - %neg = fsub contract contract nsz <4 x float> , %add + %mul = fmul contract nsz <4 x float> %a0, %a1 + %add = fadd contract nsz <4 x float> %mul, %a2 + %neg = fsub contract nsz <4 x float> , %add ret <4 x float> %neg } @@ -1624,10 +1624,10 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %mul = fmul contract contract nsz <4 x float> %a0, %a1 - %neg0 = fsub contract contract nsz <4 x float> , %mul - %add = fadd contract contract nsz <4 x float> %neg0, %a2 - %neg1 = fsub contract contract nsz <4 x float> , %add + %mul = fmul contract nsz <4 x float> %a0, %a1 + %neg0 = fsub contract nsz <4 x float> , %mul + %add = fadd contract nsz <4 x float> %neg0, %a2 + %neg1 = fsub contract nsz <4 x float> , %add ret <4 x float> %neg1 } @@ -1646,10 +1646,10 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %mul = fmul contract contract nsz <4 x double> %a0, %a1 - %neg0 = fsub contract contract nsz <4 x double> , %mul - %sub = fsub contract contract nsz <4 x double> %neg0, %a2 - %neg1 = fsub contract contract nsz <4 x double> , %sub + %mul = fmul contract nsz <4 x double> %a0, %a1 + %neg0 = fsub contract nsz <4 x double> , %mul + %sub = fsub contract nsz <4 x double> %neg0, %a2 + %neg1 = fsub contract nsz <4 x double> , %sub ret <4 x double> %neg1 } @@ -1672,9 +1672,9 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq - %m0 = fmul contract contract reassoc <4 x float> %x, - %m1 = fmul contract contract reassoc <4 x float> %x, - %a = fadd contract contract reassoc <4 x float> %m0, %m1 + %m0 = fmul contract reassoc <4 x float> %x, + %m1 = fmul contract reassoc <4 x float> %x, + %a = fadd contract reassoc <4 x float> %m0, %m1 ret <4 x float> %a } @@ -1697,9 +1697,9 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 ; AVX512-NEXT: retq - %m0 = fmul contract contract reassoc <4 x float> %x, - %m1 = fmul contract contract reassoc <4 x float> %m0, - 
%a = fadd contract contract reassoc <4 x float> %m1, %y + %m0 = fmul contract reassoc <4 x float> %x, + %m1 = fmul contract reassoc <4 x float> %m0, + %a = fadd contract reassoc <4 x float> %m1, %y ret <4 x float> %a } @@ -1723,8 +1723,8 @@ define double @test_f64_fneg_fmul(double %x, double %y) { ; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %m = fmul contract contract nsz double %x, %y - %n = fsub contract contract double -0.0, %m + %m = fmul contract nsz double %x, %y + %n = fsub contract double -0.0, %m ret double %n } @@ -1746,8 +1746,8 @@ define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) { ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %m = fmul contract contract nsz <4 x float> %x, %y - %n = fsub contract contract <4 x float> , %m + %m = fmul contract nsz <4 x float> %x, %y + %n = fsub contract <4 x float> , %m ret <4 x float> %n } @@ -1769,8 +1769,8 @@ define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) { ; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %m = fmul contract contract nsz <4 x double> %x, %y - %n = fsub contract contract <4 x double> , %m + %m = fmul contract nsz <4 x double> %x, %y + %n = fsub contract <4 x double> , %m ret <4 x double> %n } @@ -1792,8 +1792,8 @@ define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> % ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512-NEXT: retq - %m = fmul contract contract <4 x double> %x, %y - %n = fsub contract contract <4 x double> , %m + %m = fmul contract <4 x double> %x, %y + %n = fsub contract <4 x double> , %m ret <4 x double> %n } @@ -1848,7 +1848,7 @@ define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n ; AVX512-NEXT: retq %m1 = fmul contract float %a, %b %m2 = fmul contract float %c, %d - %a1 = fadd contract contract float %m1, %m2 + %a1 = fadd contract float %m1, %m2 %a2 = fadd contract reassoc float %n0, %a1 ret float %a2 } @@ -1878,8 +1878,8 @@ define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) ; AVX512-NEXT: retq %m1 = fmul contract float %a, %b %m2 = fmul contract float %c, %d - %a1 = fadd contract contract float %m1, %m2 - %a2 = fadd contract contract float %n0, %a1 + %a1 = fadd contract float %m1, %m2 + %a2 = fadd contract float %n0, %a1 ret float %a2 } diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index f5f86aa70fc0f..db30be26d04ab 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s 
--check-prefixes=CHECK-AVX,CHECK-AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-SKX declare i16 @llvm.umax.i16(i16, i16) declare i64 @llvm.umin.i64(i64, i64) @@ -16,19 +16,12 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow2_4xfloat: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow2_4xfloat: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow2_4xfloat: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow2_4xfloat: ; CHECK-SKX: # %bb.0: @@ -111,72 +104,6 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-AVX-NEXT: addq $40, %rsp ; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: subq $40, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vextractps $1, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vmovd %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vextractps $2, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vextractps $3, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-AVX512F-NEXT: addq $40, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: subq $40, %rsp -; CHECK-SKX-NEXT: .cfi_def_cfa_offset 48 -; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
CHECK-SKX-NEXT: vextractps $1, %xmm0, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vmovd %xmm0, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vextractps $2, %xmm0, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vextractps $3, %xmm0, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-SKX-NEXT: addq $40, %rsp -; CHECK-SKX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-SKX-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) ret <4 x float> %r } @@ -196,20 +123,6 @@ define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) { ; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] ; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow2_4xfloat: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-AVX512F-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow2_4xfloat: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-SKX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-SKX-NEXT: retq %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fdiv <4 x float> , %p2_f @@ -342,130 +255,114 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow2_8xhalf: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: subq $120, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 128 -; CHECK-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-AVX-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; CHECK-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-AVX-NEXT: 
vzeroupper -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-AVX-NEXT: vzeroupper -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = mem[1,0] -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-AVX-NEXT: vzeroupper -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = mem[1,0] -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX-NEXT: addq $120, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow2_8xhalf: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; CHECK-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] -; CHECK-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-AVX512F-NEXT: vzeroupper -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow2_8xhalf: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: subq $120, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 128 +; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vzeroupper +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-AVX2-NEXT: vzeroupper +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[1,0] +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vzeroupper +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[1,0] +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; 
CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-AVX2-NEXT: addq $120, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow2_8xhalf: ; CHECK-SKX: # %bb.0: @@ -563,82 +460,82 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow2_ldexp_8xhalf: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: subq $72, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 80 -; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vpextrw 
$7, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vmovd %xmm0, %eax -; CHECK-AVX-NEXT: movswl %ax, %edi -; CHECK-AVX-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 
= xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX-NEXT: addq $72, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow2_ldexp_8xhalf: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: subq $72, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 80 +; CHECK-AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa 
(%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vmovd %xmm0, %eax +; CHECK-AVX2-NEXT: movswl %ax, %edi +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-AVX2-NEXT: addq $72, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: ; CHECK-AVX512F: # %bb.0: @@ -716,83 +613,6 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: addq $72, %rsp ; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: subq $72, %rsp -; CHECK-SKX-NEXT: .cfi_def_cfa_offset 80 -; CHECK-SKX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-SKX-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SKX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SKX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-SKX-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SKX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SKX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SKX-NEXT: vmovd %xmm0, %eax -; CHECK-SKX-NEXT: movswl %ax, %edi -; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: callq ldexpf@PLT -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SKX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-SKX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-SKX-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-SKX-NEXT: addq $72, %rsp -; CHECK-SKX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-SKX-NEXT: retq %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> , <8 x i16> %i) ret <8 x half> %r } @@ -806,19 +626,12 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow2_8xhalf: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] -; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow2_8xhalf: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpsllw $10, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] -; CHECK-AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fdiv_pow2_8xhalf: ; CHECK-SKX: # %bb.0: @@ -852,24 +665,6 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; CHECK-AVX-NEXT: addq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt: -; 
CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: shlq $52, %rax -; CHECK-AVX512F-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000 -; CHECK-AVX512F-NEXT: addq %rax, %rcx -; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_shl_cnt: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: shlq $52, %rax -; CHECK-SKX-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000 -; CHECK-SKX-NEXT: addq %rax, %rcx -; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv @@ -898,26 +693,6 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; CHECK-AVX-NEXT: addq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: incl %eax -; CHECK-AVX512F-NEXT: shlq $52, %rax -; CHECK-AVX512F-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 -; CHECK-AVX512F-NEXT: addq %rax, %rcx -; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_shl_cnt2: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: incl %eax -; CHECK-SKX-NEXT: shlq $52, %rax -; CHECK-SKX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 -; CHECK-SKX-NEXT: addq %rax, %rcx -; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double %mul = fmul double -9.000000e+00, %conv @@ -943,24 +718,6 @@ define double @fmul_pow_shl_cnt3(i8 %cnt) nounwind { ; CHECK-AVX-NEXT: addq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt3: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: shlq $52, %rax -; CHECK-AVX512F-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 -; CHECK-AVX512F-NEXT: addq %rax, %rcx -; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_shl_cnt3: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: shlq $52, %rax -; CHECK-SKX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 -; CHECK-SKX-NEXT: addq %rax, %rcx -; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 -; CHECK-SKX-NEXT: retq %zext_cnt = zext i8 %cnt to i64 %shl = shl nuw i64 1, %zext_cnt %conv = uitofp i64 %shl to double @@ -992,28 +749,6 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 ; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_select: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: leal 1(%rax), %ecx -; CHECK-AVX512F-NEXT: testb $1, %sil -; CHECK-AVX512F-NEXT: cmovnel %eax, %ecx -; CHECK-AVX512F-NEXT: shll $23, %ecx -; CHECK-AVX512F-NEXT: addl $1091567616, %ecx # imm = 0x41100000 -; CHECK-AVX512F-NEXT: vmovd %ecx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_select: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: leal 1(%rax), %ecx -; CHECK-SKX-NEXT: testb $1, %sil -; CHECK-SKX-NEXT: cmovnel %eax, %ecx -; CHECK-SKX-NEXT: shll $23, %ecx -; CHECK-SKX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 -; CHECK-SKX-NEXT: 
vmovd %ecx, %xmm0 -; CHECK-SKX-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt %shl = select i1 %c, i32 %shl1, i32 %shl2 @@ -1048,30 +783,6 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 ; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: addl $3, %eax -; CHECK-AVX512F-NEXT: cmpl $13, %eax -; CHECK-AVX512F-NEXT: movl $13, %ecx -; CHECK-AVX512F-NEXT: cmovbl %eax, %ecx -; CHECK-AVX512F-NEXT: shll $23, %ecx -; CHECK-AVX512F-NEXT: addl $1091567616, %ecx # imm = 0x41100000 -; CHECK-AVX512F-NEXT: vmovd %ecx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: addl $3, %eax -; CHECK-SKX-NEXT: cmpl $13, %eax -; CHECK-SKX-NEXT: movl $13, %ecx -; CHECK-SKX-NEXT: cmovbl %eax, %ecx -; CHECK-SKX-NEXT: shll $23, %ecx -; CHECK-SKX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 -; CHECK-SKX-NEXT: vmovd %ecx, %xmm0 -; CHECK-SKX-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) %conv = uitofp i64 %shl to float @@ -1105,30 +816,6 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: orq %rcx, %rax ; CHECK-AVX-NEXT: vmovq %rax, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_mul_max_pow2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: leaq 1(%rax), %rcx -; CHECK-AVX512F-NEXT: cmpq %rcx, %rax -; CHECK-AVX512F-NEXT: cmovaq %rax, %rcx -; CHECK-AVX512F-NEXT: shlq $52, %rcx -; CHECK-AVX512F-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 -; CHECK-AVX512F-NEXT: orq %rcx, %rax -; CHECK-AVX512F-NEXT: vmovq %rax, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_mul_max_pow2: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: leaq 1(%rax), %rcx -; CHECK-SKX-NEXT: cmpq %rcx, %rax -; CHECK-SKX-NEXT: cmovaq %rax, %rcx -; CHECK-SKX-NEXT: shlq $52, %rcx -; CHECK-SKX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 -; CHECK-SKX-NEXT: orq %rcx, %rax -; CHECK-SKX-NEXT: vmovq %rax, %xmm0 -; CHECK-SKX-NEXT: retq %shl2 = shl nuw i16 2, %cnt %shl1 = shl nuw i16 1, %cnt %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2) @@ -1152,27 +839,18 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movq %rsi, %rcx -; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX-NEXT: shlq %cl, %rdi -; CHECK-AVX-NEXT: vmovq %rdi, %xmm0 -; CHECK-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movq %rsi, %rcx -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX512F-NEXT: shlq %cl, %rdi -; CHECK-AVX512F-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 -; 
CHECK-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rsi, %rcx +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shlq %cl, %rdi +; CHECK-AVX2-NEXT: vmovq %rdi, %xmm0 +; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-SKX: # %bb.0: @@ -1205,31 +883,18 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] -; CHECK-AVX-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 -; CHECK-AVX-NEXT: vmovq %xmm0, %rax -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] -; CHECK-AVX512F-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 -; CHECK-AVX512F-NEXT: vmovq %xmm0, %rax -; CHECK-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 +; CHECK-AVX2-NEXT: vmovq %xmm0, %rax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-SKX: # %bb.0: @@ -1251,17 +916,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: 
vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-SKX: # %bb.0: @@ -1282,22 +941,13 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; CHECK-SSE-NEXT: addps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] -; CHECK-AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-AVX512F-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 -; CHECK-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm2 -; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] -; CHECK-AVX512F-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-SKX: # %bb.0: @@ -1321,21 +971,13 @@ define <4 x float> @fmul_pow_shl_cnt_vec_no_fma(<4 x i32> %cnt, <4 x float> %add ; CHECK-SSE-NEXT: addps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_no_fma: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] -; CHECK-AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_no_fma: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] -; CHECK-AVX512F-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_no_fma: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_no_fma: ; CHECK-SKX: # %bb.0: @@ -1362,18 +1004,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 ; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> 
%shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -1392,18 +1022,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 ; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -1445,52 +1063,36 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-NEXT: addq $40, %rsp ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: subq $56, %rsp -; CHECK-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] -; CHECK-AVX-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-AVX-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: vzeroupper -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: vzeroupper -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-AVX-NEXT: addq $56, %rsp -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] -; CHECK-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; CHECK-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; CHECK-AVX512F-NEXT: 
vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-AVX512F-NEXT: vzeroupper -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: subq $56, %rsp +; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vzeroupper +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vzeroupper +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-AVX2-NEXT: addq $56, %rsp +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-SKX: # %bb.0: @@ -1526,29 +1128,19 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movq %rdi, %rcx -; CHECK-AVX-NEXT: movl $1, %eax -; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX-NEXT: shlq %cl, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 -; CHECK-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movq %rdi, %rcx -; CHECK-AVX512F-NEXT: movl $1, %eax -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX512F-NEXT: shlq %cl, %rax -; CHECK-AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rdi, %rcx +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shlq %cl, %rax +; CHECK-AVX2-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: 
vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: ; CHECK-SKX: # %bb.0: @@ -1583,24 +1175,6 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: addq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: shlq $52, %rax -; CHECK-AVX512F-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992 -; CHECK-AVX512F-NEXT: addq %rax, %rcx -; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: shlq $52, %rax -; CHECK-SKX-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992 -; CHECK-SKX-NEXT: addq %rax, %rcx -; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to double %mul = fmul double 9.745314e+288, %conv @@ -1622,20 +1196,6 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] ; CHECK-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] -; CHECK-AVX512F-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] -; CHECK-SKX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fdiv <2 x double> , %conv @@ -1658,22 +1218,6 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou ; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] ; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] -; CHECK-AVX512F-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] -; CHECK-SKX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x float> %mul = fdiv <2 x float> , %conv @@ -1701,36 +1245,25 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movq %rdi, %rcx -; CHECK-AVX-NEXT: movl $8, %eax -; CHECK-AVX-NEXT: # kill: 
def $cl killed $cl killed $rcx -; CHECK-AVX-NEXT: shlq %cl, %rax -; CHECK-AVX-NEXT: testq %rax, %rax -; CHECK-AVX-NEXT: js .LBB24_1 -; CHECK-AVX-NEXT: # %bb.2: -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: jmp .LBB24_3 -; CHECK-AVX-NEXT: .LBB24_1: -; CHECK-AVX-NEXT: shrq %rax -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: .LBB24_3: -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movq %rdi, %rcx -; CHECK-AVX512F-NEXT: movl $8, %eax -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX512F-NEXT: shlq %cl, %rax -; CHECK-AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rdi, %rcx +; CHECK-AVX2-NEXT: movl $8, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shlq %cl, %rax +; CHECK-AVX2-NEXT: testq %rax, %rax +; CHECK-AVX2-NEXT: js .LBB24_1 +; CHECK-AVX2-NEXT: # %bb.2: +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: jmp .LBB24_3 +; CHECK-AVX2-NEXT: .LBB24_1: +; CHECK-AVX2-NEXT: shrq %rax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: .LBB24_3: +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; CHECK-SKX: # %bb.0: @@ -1758,27 +1291,16 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movq %rdi, %rcx -; CHECK-AVX-NEXT: movl $8, %eax -; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX-NEXT: shlq %cl, %rax -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movq %rdi, %rcx -; CHECK-AVX512F-NEXT: movl $8, %eax -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX512F-NEXT: shlq %cl, %rax -; CHECK-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rdi, %rcx +; CHECK-AVX2-NEXT: movl $8, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shlq %cl, %rax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; CHECK-SKX: # %bb.0: @@ -1812,24 +1334,6 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; CHECK-AVX-NEXT: subl %edi, %eax ; CHECK-AVX-NEXT: 
vmovd %eax, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: andl $31, %edi -; CHECK-AVX512F-NEXT: shll $23, %edi -; CHECK-AVX512F-NEXT: movl $-1115684864, %eax # imm = 0xBD800000 -; CHECK-AVX512F-NEXT: subl %edi, %eax -; CHECK-AVX512F-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: andl $31, %edi -; CHECK-SKX-NEXT: shll $23, %edi -; CHECK-SKX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000 -; CHECK-SKX-NEXT: subl %edi, %eax -; CHECK-SKX-NEXT: vmovd %eax, %xmm0 -; CHECK-SKX-NEXT: retq %cnt = and i64 %cnt_in, 31 %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float @@ -1855,35 +1359,21 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: pushq %rax -; CHECK-AVX-NEXT: movl %edi, %ecx -; CHECK-AVX-NEXT: movl $1, %eax -; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX-NEXT: shll %cl, %eax -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: popq %rax -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movl %edi, %ecx -; CHECK-AVX512F-NEXT: movl $1, %eax -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX512F-NEXT: shll %cl, %eax -; CHECK-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: pushq %rax +; CHECK-AVX2-NEXT: movl %edi, %ecx +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX2-NEXT: shll %cl, %eax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: popq %rax +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; CHECK-SKX: # %bb.0: @@ -1918,22 +1408,6 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: subl %edi, %eax ; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: shll $10, %edi -; CHECK-AVX512F-NEXT: movl $28672, %eax # imm = 0x7000 -; CHECK-AVX512F-NEXT: subl %edi, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: shll $10, %edi -; CHECK-SKX-NEXT: movl $28672, %eax # imm = 0x7000 -; CHECK-SKX-NEXT: subl %edi, %eax -; CHECK-SKX-NEXT: vpinsrw $0, 
%eax, %xmm0, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH7000, %conv @@ -1956,22 +1430,6 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind { ; CHECK-AVX-NEXT: subl %edi, %eax ; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: shll $10, %edi -; CHECK-AVX512F-NEXT: movl $18432, %eax # imm = 0x4800 -; CHECK-AVX512F-NEXT: subl %edi, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: shll $10, %edi -; CHECK-SKX-NEXT: movl $18432, %eax # imm = 0x4800 -; CHECK-SKX-NEXT: subl %edi, %eax -; CHECK-SKX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4800, %conv @@ -1997,37 +1455,22 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: pushq %rax -; CHECK-AVX-NEXT: movl %edi, %ecx -; CHECK-AVX-NEXT: movl $1, %eax -; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX-NEXT: shll %cl, %eax -; CHECK-AVX-NEXT: movzwl %ax, %eax -; CHECK-AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX-NEXT: popq %rax -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movl %edi, %ecx -; CHECK-AVX512F-NEXT: movl $1, %eax -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX512F-NEXT: shll %cl, %eax -; CHECK-AVX512F-NEXT: movzwl %ax, %eax -; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: pushq %rax +; CHECK-AVX2-NEXT: movl %edi, %ecx +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX2-NEXT: shll %cl, %eax +; CHECK-AVX2-NEXT: movzwl %ax, %eax +; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: popq %rax +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; CHECK-SKX: # %bb.0: @@ -2067,24 +1510,6 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; CHECK-AVX-NEXT: subq %rax, %rcx ; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: shlq $52, %rax -; 
CHECK-AVX512F-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 -; CHECK-AVX512F-NEXT: subq %rax, %rcx -; CHECK-AVX512F-NEXT: vmovq %rcx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: shlq $52, %rax -; CHECK-SKX-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 -; CHECK-SKX-NEXT: subq %rax, %rcx -; CHECK-SKX-NEXT: vmovq %rcx, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double %mul = fdiv double 0x36A0000000000000, %conv @@ -2103,27 +1528,16 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movl %edi, %ecx -; CHECK-AVX-NEXT: movl $1, %eax -; CHECK-AVX-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX-NEXT: shll %cl, %eax -; CHECK-AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movl %edi, %ecx -; CHECK-AVX512F-NEXT: movl $1, %eax -; CHECK-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX512F-NEXT: shll %cl, %eax -; CHECK-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX512F-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movl %edi, %ecx +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX2-NEXT: shll %cl, %eax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: retq ; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; CHECK-SKX: # %bb.0: @@ -2159,24 +1573,6 @@ define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind { ; CHECK-AVX-NEXT: subl %eax, %ecx ; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: fdiv_pow_shl_cnt32_okay: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: movzbl %dil, %eax -; CHECK-AVX512F-NEXT: shll $23, %eax -; CHECK-AVX512F-NEXT: movl $285212672, %ecx # imm = 0x11000000 -; CHECK-AVX512F-NEXT: subl %eax, %ecx -; CHECK-AVX512F-NEXT: vmovd %ecx, %xmm0 -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_okay: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: movzbl %dil, %eax -; CHECK-SKX-NEXT: shll $23, %eax -; CHECK-SKX-NEXT: movl $285212672, %ecx # imm = 0x11000000 -; CHECK-SKX-NEXT: subl %eax, %ecx -; CHECK-SKX-NEXT: vmovd %ecx, %xmm0 -; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float %mul = fdiv float 0x3a20000000000000, %conv @@ -2205,32 +1601,8 @@ define x86_fp80 @pr128528(i1 %cond) { ; CHECK-AVX-NEXT: fildl -{{[0-9]+}}(%rsp) ; CHECK-AVX-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip) ; CHECK-AVX-NEXT: retq -; -; CHECK-AVX512F-LABEL: pr128528: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: testb $1, %dil -; CHECK-AVX512F-NEXT: movl $8, %eax -; CHECK-AVX512F-NEXT: movl $1, %ecx -; CHECK-AVX512F-NEXT: cmovnel %eax, %ecx -; CHECK-AVX512F-NEXT: movl 
%ecx, -{{[0-9]+}}(%rsp) -; CHECK-AVX512F-NEXT: fildl -{{[0-9]+}}(%rsp) -; CHECK-AVX512F-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip) -; CHECK-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: pr128528: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: testb $1, %dil -; CHECK-SKX-NEXT: movl $8, %eax -; CHECK-SKX-NEXT: movl $1, %ecx -; CHECK-SKX-NEXT: cmovnel %eax, %ecx -; CHECK-SKX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; CHECK-SKX-NEXT: fildl -{{[0-9]+}}(%rsp) -; CHECK-SKX-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip) -; CHECK-SKX-NEXT: retq %sub9 = select i1 %cond, i32 8, i32 1 %conv = uitofp i32 %sub9 to x86_fp80 %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800 ret x86_fp80 %mul } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index e1649ba733498..e01b908cfd5b6 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -46,7 +46,7 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]] ; CHECK-NEXT: $xmm0 = COPY [[COPY5]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call contract ninf afn contract float @llvm.sqrt.f32(float %f) + %call = tail call contract ninf afn float @llvm.sqrt.f32(float %f) ret float %call } From db3af3a253c0b7830e02f616eaef096e7d0999a6 Mon Sep 17 00:00:00 2001 From: "Pirog, Mikolaj Maciej" Date: Tue, 16 Sep 2025 15:18:19 +0200 Subject: [PATCH 5/6] Apply reviewer suggestion --- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 165 +++++++++++++++++- 1 file changed, 164 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index db30be26d04ab..81529aff39ff1 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-ONLY-AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-SKX declare i16 @llvm.umax.i16(i16, i16) @@ -23,6 +23,13 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_4xfloat: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-ONLY-AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow2_4xfloat: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 @@ -364,6 +371,22 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_8xhalf: +; CHECK-ONLY-AVX512F: # %bb.0: +; 
CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-ONLY-AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] +; CHECK-ONLY-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vzeroupper +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow2_8xhalf: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] @@ -633,6 +656,13 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow2_8xhalf: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpsllw $10, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-ONLY-AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fdiv_pow2_8xhalf: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpsllw $10, %xmm0, %xmm0 @@ -852,6 +882,15 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rsi, %rcx +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rdi +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: shlxq %rsi, %rdi, %rax @@ -896,6 +935,19 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; CHECK-ONLY-AVX512F-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 +; CHECK-ONLY-AVX512F-NEXT: vmovq %xmm0, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-ONLY-AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] @@ -922,6 +974,12 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; 
CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0 @@ -949,6 +1007,15 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] +; CHECK-ONLY-AVX512F-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm2 +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] @@ -979,6 +1046,14 @@ define <4 x float> @fmul_pow_shl_cnt_vec_no_fma(<4 x i32> %cnt, <4 x float> %add ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_no_fma: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-ONLY-AVX512F-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_no_fma: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 @@ -1094,6 +1169,22 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-NEXT: addq $56, %rsp ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-ONLY-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vzeroupper +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] @@ -1142,6 +1233,16 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; 
CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: movl $1, %eax @@ -1265,6 +1366,17 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: movl $8, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: movl $8, %eax @@ -1302,6 +1414,17 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: movl $8, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: movl $8, %eax @@ -1375,6 +1498,20 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movl %edi, %ecx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: movl $1, %eax @@ -1472,6 +1609,21 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movl %edi, %ecx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: movzwl %ax, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: movl $1, %eax @@ -1539,6 +1691,17 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movl %edi, %ecx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; ; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: movl $1, %eax From 51b1425122484f69145cd1cf72028be8432ec835 Mon Sep 17 00:00:00 2001 From: "Pirog, Mikolaj Maciej" Date: Tue, 16 Sep 2025 15:58:12 +0200 Subject: [PATCH 6/6] Don't change sqrt-fastmath-mir --- llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 50 +++++++++++----------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index e01b908cfd5b6..42617c1573be5 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s +; RUN: llc -fp-contract=fast < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s declare float @llvm.sqrt.f32(float) #2 @@ -10,10 +10,10 @@ define float @sqrt_ieee(float %f) #0 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = contract nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call contract float @llvm.sqrt.f32(float %f) + %call = tail call float @llvm.sqrt.f32(float %f) ret float %call } @@ -25,16 +25,16 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept 
VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
 ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
- ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]]
 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]]
 ; CHECK-NEXT: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool)
@@ -46,7 +46,7 @@ define float @sqrt_ieee_ninf(float %f) #0 {
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]]
 ; CHECK-NEXT: $xmm0 = COPY [[COPY5]]
 ; CHECK-NEXT: RET 0, $xmm0
- %call = tail call contract ninf afn float @llvm.sqrt.f32(float %f)
+ %call = tail call ninf afn float @llvm.sqrt.f32(float %f)
 ret float %call
 }

@@ -57,10 +57,10 @@ define float @sqrt_daz(float %f) #1 {
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = contract nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr
+ ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr
 ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]]
 ; CHECK-NEXT: RET 0, $xmm0
- %call = tail call contract float @llvm.sqrt.f32(float %f)
+ %call = tail call float @llvm.sqrt.f32(float %f)
 ret float %call
 }

@@ -72,16 +72,16 @@ define float @sqrt_daz_ninf(float %f) #1 {
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
 ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]]
- ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
 ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
 ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
- ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]]
 ; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
 ; CHECK-NEXT: [[VCMPSSrri:%[0-9]+]]:fr32 = nofpexcept VCMPSSrri [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr
@@ -90,7 +90,7 @@ define float @sqrt_daz_ninf(float %f) #1 {
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]]
 ; CHECK-NEXT: $xmm0 = COPY [[COPY3]]
 ; CHECK-NEXT: RET 0, $xmm0
- %call = tail call contract ninf afn float @llvm.sqrt.f32(float %f)
+ %call = tail call ninf afn float @llvm.sqrt.f32(float %f)
 ret float %call
 }

@@ -114,7 +114,7 @@ define float @rsqrt_ieee(float %f) #0 {
 ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
 ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]]
 ; CHECK-NEXT: RET 0, $xmm0
- %sqrt = tail call contract float @llvm.sqrt.f32(float %f)
+ %sqrt = tail call float @llvm.sqrt.f32(float %f)
 %div = fdiv fast float 1.0, %sqrt
 ret float %div
 }
@@ -139,7 +139,7 @@ define float @rsqrt_daz(float %f) #1 {
 ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
 ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]]
 ; CHECK-NEXT: RET 0, $xmm0
- %sqrt = tail call contract float @llvm.sqrt.f32(float %f)
+ %sqrt = tail call float @llvm.sqrt.f32(float %f)
 %div = fdiv fast float 1.0, %sqrt
 ret float %div
 }
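Note: the hunks above only drop `contract` where the removed global -fp-contract=fast option used to supply it implicitly; the rsqrt tests keep `contract` on the expanded multiplies because it flows from the `fast` fdiv. As a minimal standalone sketch of the per-instruction approach the tests now rely on (not part of the patch; the function name and the exact llc invocation, e.g. `llc -mtriple=x86_64-- -mattr=+fma` with no -fp-contract option, are illustrative assumptions), flags on the IR instructions alone are what permit FMA formation:

  define float @fma_candidate(float %a, float %b, float %c) {
    ; Both the fmul and the fadd carry the `contract` flag, so the
    ; backend is allowed to fuse the pair into a single FMA; without
    ; the flag on both instructions (and without a global contract
    ; option), they typically remain a separate multiply and add.
    %m = fmul contract float %a, %b
    %r = fadd contract float %m, %c
    ret float %r
  }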