From d620eb65d04f7233124e2e5dc68ba98b7a8ca8a4 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Fri, 31 Jan 2025 16:19:52 +0530 Subject: [PATCH 1/4] 8349138: Optimize Math.copySign API for Intel e-core and p-core targets --- src/hotspot/cpu/x86/x86.ad | 107 +++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 95b761ad44ead..bee709122acb6 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1610,13 +1610,9 @@ bool Matcher::match_rule_supported(int opcode) { break; case Op_CopySignD: case Op_CopySignF: - if (UseAVX < 3 || !is_LP64) { + if (UseAVX < 1 || !is_LP64) { return false; } - if (!VM_Version::supports_avx512vl()) { - return false; - } - break; #ifndef _LP64 case Op_AddReductionVF: case Op_AddReductionVD: @@ -6711,54 +6707,97 @@ instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{ ins_pipe( pipe_slow ); %} -// --------------------------------------- -// For copySign use 0xE4 as writemask for vpternlog -// Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit -// C (xmm2) is set to 0x7FFFFFFF -// Wherever xmm2 is 0, we want to pick from B (sign) -// Wherever xmm2 is 1, we want to pick from A (src) +// ---------------------------------------------------------------------- +// We are using bitwise ternary logic insturction VPTERNLOG which can +// absorb complex binary expressions involving 3 boolean variables. +// +// For copySign we set the truth table value as 0xE4. +// First column of truth table represents magnitude, second column +// represents sign operand while the third column is a conditional +// operand with fixed value of 0x7FFFFFFF. +// +// Whenever condition bit is 1 corresponding magnitude bit gets selected +// else corresponding sign bit is picked. +// Our condition mask is such that apart for sign bit i.e. MSB bit all +// other bits are set to 1, this ensures that all the bits of result +// apart from MSB bit are copied from magnitude operand while sign bit +// is borrowed from sign operand. // -// A B C Result -// 0 0 0 0 -// 0 0 1 0 -// 0 1 0 1 -// 0 1 1 0 -// 1 0 0 0 -// 1 0 1 1 -// 1 1 0 1 -// 1 1 1 1 +// Magnitude Sign Condition Result +// 0 0 0 0 +// 0 0 1 0 +// 0 1 0 1 +// 0 1 1 0 +// 1 0 0 0 +// 1 0 1 1 +// 1 1 0 1 +// 1 1 1 1 // -// Result going from high bit to low bit is 0x11100100 = 0xe4 -// --------------------------------------- +// ---------------------------------------------------------------------- #ifdef _LP64 -instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{ +instruct copySignF_reg(regF dst, regF src, regF xtmp1) %{ + predicate(VM_Version::supports_avx512vl()); match(Set dst (CopySignF dst src)); - effect(TEMP tmp1, TEMP tmp2); - format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %} + effect(TEMP xtmp1); + format %{ "CopySignF $dst, $src\t! using $xtmp1 as TEMP" %} ins_encode %{ - __ movl($tmp2$$Register, 0x7FFFFFFF); - __ movdl($tmp1$$XMMRegister, $tmp2$$Register); - __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpcmpeqd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpsrld($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); + __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} -instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{ +instruct copySignD_imm(regD dst, regD src, regD xtmp1, immD zero) %{ + predicate(VM_Version::supports_avx512vl()); match(Set dst (CopySignD dst (Binary src zero))); ins_cost(100); - effect(TEMP tmp1, TEMP tmp2); - format %{ "CopySignD $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %} + effect(TEMP xtmp1); + format %{ "CopySignD $dst, $src\t! using $xtmp1 as TEMP" %} ins_encode %{ - __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF); - __ movq($tmp1$$XMMRegister, $tmp2$$Register); - __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpcmpeqd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpsrlq($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); + __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} #endif // _LP64 +instruct copySignF_reg_avx(regF dst, regF src, regF xtmp1, regF xtmp2) %{ + predicate(!VM_Version::supports_avx512vl()); + match(Set dst (CopySignF dst src)); + effect(TEMP_DEF dst,TEMP xtmp1, TEMP xtmp2); + format %{ "CopySignF $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %} + ins_encode %{ + __ vpcmpeqd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpsrld($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); + __ vpxor($xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vandps($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit); + __ vandps($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); + __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, Assembler::AVX_128bit); + %} + ins_pipe( pipe_slow ); +%} + +instruct copySignD_imm_avx(regD dst, regD src, regD xtmp1, regD xtmp2, immD zero) %{ + predicate(!VM_Version::supports_avx512vl()); + match(Set dst (CopySignD dst (Binary src zero))); + ins_cost(100); + effect(TEMP_DEF dst,TEMP xtmp1, TEMP xtmp2); + format %{ "CopySignD $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %} + ins_encode %{ + __ vpcmpeqq($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpsrlq($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); + __ vpxor($xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vandpd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit); + __ vandpd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); + __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, Assembler::AVX_128bit); + %} + ins_pipe( pipe_slow ); +%} + //----------------------------- CompressBits/ExpandBits ------------------------ instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{ From 2181850d940d267190b0775da628191345613179 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Mon, 3 Feb 2025 19:21:22 +0530 Subject: [PATCH 2/4] Adding IR framework verification test --- src/hotspot/cpu/x86/x86.ad | 24 +-- .../math/TestCopySignIntrinsic.java | 144 ++++++++++++++++++ .../compiler/lib/ir_framework/IRNode.java | 10 ++ 3 files changed, 166 insertions(+), 12 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index bee709122acb6..a949c89523e8e 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -6736,29 +6736,29 @@ instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{ // ---------------------------------------------------------------------- #ifdef _LP64 -instruct copySignF_reg(regF dst, regF src, regF xtmp1) %{ +instruct copySignF_reg(regF dst, regF src, regF xtmp) %{ predicate(VM_Version::supports_avx512vl()); match(Set dst (CopySignF dst src)); - effect(TEMP xtmp1); - format %{ "CopySignF $dst, $src\t! using $xtmp1 as TEMP" %} + effect(TEMP xtmp); + format %{ "CopySignF $dst, $src\t! using $xtmp as TEMP" %} ins_encode %{ - __ vpcmpeqd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); - __ vpsrld($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); - __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpternlogd($xtmp$$XMMRegister, 0xFF, $xtmp$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); + __ vpsrld($xtmp$$XMMRegister, $xtmp$$XMMRegister, 1, Assembler::AVX_128bit); + __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} -instruct copySignD_imm(regD dst, regD src, regD xtmp1, immD zero) %{ +instruct copySignD_imm(regD dst, regD src, regD xtmp, immD zero) %{ predicate(VM_Version::supports_avx512vl()); match(Set dst (CopySignD dst (Binary src zero))); ins_cost(100); - effect(TEMP xtmp1); - format %{ "CopySignD $dst, $src\t! using $xtmp1 as TEMP" %} + effect(TEMP xtmp); + format %{ "CopySignD $dst, $src\t! using $xtmp as TEMP" %} ins_encode %{ - __ vpcmpeqd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); - __ vpsrlq($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); - __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); + __ vpternlogq($xtmp$$XMMRegister, 0xFF, $xtmp$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); + __ vpsrlq($xtmp$$XMMRegister, $xtmp$$XMMRegister, 1, Assembler::AVX_128bit); + __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} diff --git a/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java b/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java new file mode 100644 index 0000000000000..7318bee1f328c --- /dev/null +++ b/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8349138 + * @key randomness + * @summary Optimize Math.copySign API for Intel e-core targets + * @library /test/lib / + * @run driver compiler.intrinsics.math.TestCopySignIntrinsic +*/ + +package compiler.intrinsics.math; + +import compiler.lib.ir_framework.Check; +import compiler.lib.ir_framework.IR; +import compiler.lib.ir_framework.IRNode; +import compiler.lib.ir_framework.Test; +import compiler.lib.ir_framework.TestFramework; +import compiler.lib.ir_framework.Setup; +import java.util.stream.IntStream; +import java.util.Random; +import jdk.test.lib.Utils; + +public class TestCopySignIntrinsic { + private static final Random rd = Utils.getRandomInstance(); + + public static void main(String[] args) { + new TestFramework(TestCopySignIntrinsic.class).run(); + } + + public final int SIZE = 1024; + public float [] fmagnitude; + public float [] fsign; + public float [] afresult; + public float [] efresult; + + public double [] dmagnitude; + public double [] dsign; + public double [] adresult; + public double [] edresult; + + public TestCopySignIntrinsic() { + fmagnitude = new float[SIZE]; + fsign = new float[SIZE]; + + dmagnitude = new double[SIZE]; + dsign = new double[SIZE]; + + afresult = new float[SIZE]; + efresult = new float[SIZE]; + + adresult = new double[SIZE]; + edresult = new double[SIZE]; + + IntStream.range(0, SIZE - 8).forEach(i -> { fmagnitude[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); + IntStream.range(0, SIZE - 8).forEach(i -> { dmagnitude[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); + IntStream.range(0, SIZE).forEach(i -> { fsign[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); + IntStream.range(0, SIZE).forEach(i -> { dsign[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); + + fmagnitude[SIZE - 1] = Float.NaN; + fmagnitude[SIZE - 2] = Float.NaN; + fmagnitude[SIZE - 3] = 0.0f; + fmagnitude[SIZE - 4] = -0.0f; + fmagnitude[SIZE - 5] = Float.NEGATIVE_INFINITY; + fmagnitude[SIZE - 6] = Float.POSITIVE_INFINITY; + fmagnitude[SIZE - 7] = -Float.MIN_VALUE; + fmagnitude[SIZE - 8] = Float.MIN_VALUE; + + dmagnitude[SIZE - 1] = Double.NaN; + dmagnitude[SIZE - 2] = Double.NaN; + dmagnitude[SIZE - 3] = 0.0; + dmagnitude[SIZE - 4] = -0.0; + dmagnitude[SIZE - 5] = Double.NEGATIVE_INFINITY; + dmagnitude[SIZE - 6] = Double.POSITIVE_INFINITY; + dmagnitude[SIZE - 7] = -Double.MIN_VALUE; + dmagnitude[SIZE - 8] = Double.MIN_VALUE; + + for (int i = 0; i < SIZE; i++) { + efresult[i] = Math.copySign(fmagnitude[i], fsign[i]); + edresult[i] = Math.copySign(dmagnitude[i], dsign[i]); + } + } + + @Test + @IR(counts = {IRNode.COPYSIGN_F, " >0 "}, applyIfCPUFeature = { "avx", "true"}) + public void testCopySignF() { + for (int i = 0; i < SIZE; i++) { + afresult[i] = Math.copySign(fmagnitude[i], fsign[i]); + } + } + + @Check(test = "testCopySignF") + public void checkCopySignF() { + for (int i = 0; i < SIZE; i++) { + if (afresult[i] != efresult[i]) { + if (Float.isNaN(afresult[i]) ^ Float.isNaN(efresult[i])) { + throw new RuntimeException("Incorrect result, Math.copySign(" + fmagnitude[i] + " , " + fsign[i] + ") => " + + "expected : " + efresult[i] + " != " + " actual : " + afresult[i]); + } + } + } + } + + @Test + @IR(counts = {IRNode.COPYSIGN_D, " >0 "}, applyIfCPUFeature = { "avx", "true"}) + public void testCopySignD() { + for (int i = 0; i < SIZE; i++) { + adresult[i] = Math.copySign(dmagnitude[i], dsign[i]); + } + } + + @Check(test = "testCopySignD") + public void checkCopySignD() { + for (int i = 0; i < SIZE; i++) { + if (Double.isNaN(adresult[i]) ^ Double.isNaN(edresult[i])) { + if (adresult[i] != edresult[i]) { + throw new RuntimeException("Incorrect result, Math.copySign(" + dmagnitude[i] + " , " + dsign[i] + ") => " + + "expected : " + edresult[i] + " != " + " actual : " + adresult[i]); + } + } + } + } +} diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index bfbfd80139c46..ec77ae0db2bd2 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -1591,6 +1591,16 @@ public class IRNode { optoOnly(SCOPE_OBJECT, regex); } + public static final String COPYSIGN_F = PREFIX + "COPYSIGN_F" + POSTFIX; + static { + beforeMatchingNameRegex(COPYSIGN_F, "CopySignF"); + } + + public static final String COPYSIGN_D = PREFIX + "COPYSIGN_D" + POSTFIX; + static { + beforeMatchingNameRegex(COPYSIGN_D, "CopySignD"); + } + public static final String SIGNUM_VD = VECTOR_PREFIX + "SIGNUM_VD" + POSTFIX; static { vectorNode(SIGNUM_VD, "SignumVD", TYPE_DOUBLE); From a25487328858355bec82745ea0f7ef77d58d9cbd Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Thu, 6 Feb 2025 22:49:44 +0530 Subject: [PATCH 3/4] Adding vector support along with some refactoring. --- src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp | 93 +++++++++++++++++++ src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp | 9 ++ src/hotspot/cpu/x86/x86.ad | 88 +++++++----------- src/hotspot/share/adlc/formssel.cpp | 2 +- src/hotspot/share/opto/classes.hpp | 2 + .../share/opto/superwordVTransformBuilder.cpp | 4 + src/hotspot/share/opto/vectornode.cpp | 7 ++ src/hotspot/share/opto/vectornode.hpp | 16 ++++ .../math/TestCopySignIntrinsic.java | 4 +- .../compiler/lib/ir_framework/IRNode.java | 10 ++ 10 files changed, 179 insertions(+), 56 deletions(-) diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 87583ddabd5e9..47bafa619f405 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -7080,3 +7080,96 @@ void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, X vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); } } + +// ---------------------------------------------------------------------- +// We are using bitwise ternary logic insturction VPTERNLOG which can +// absorb complex binary expressions involving 3 boolean variables. +// +// For copySign we set the truth table value as 0xE4. +// First column of truth table represents magnitude, second column +// represents sign operand while the third column is a conditional +// operand with fixed value of 0x7FFFFFFF. +// +// Whenever condition bit is 1 corresponding magnitude bit gets selected +// else corresponding sign bit is picked. +// Our condition mask is such that apart for sign bit i.e. MSB bit all +// other bits are set to 1, this ensures that all the bits of result +// apart from MSB bit are copied from magnitude operand while sign bit +// is borrowed from sign operand. +// +// Magnitude Sign Condition Result +// 0 0 0 0 +// 0 0 1 0 +// 0 1 0 1 +// 0 1 1 0 +// 1 0 0 0 +// 1 0 1 1 +// 1 1 0 1 +// 1 1 1 1 +// +// ---------------------------------------------------------------------- + +void C2_MacroAssembler::vector_copy_sign_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src, + XMMRegister xtmp, int vlen_enc) { + assert(is_floating_point_type(elem_bt), ""); + vpternlogq(xtmp, 0xFF, xtmp, xtmp, vlen_enc); + if (elem_bt == T_FLOAT) { + vpsrld(xtmp, xtmp, 1, vlen_enc); + vpternlogd(dst, 0xE4, src, xtmp, vlen_enc); + } else { + assert(elem_bt == T_DOUBLE, ""); + vpsrlq(xtmp, xtmp, 1, vlen_enc); + vpternlogq(dst, 0xE4, src, xtmp, vlen_enc); + } +} + +void C2_MacroAssembler::vandpsd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { + if (elem_bt == T_FLOAT) { + vandps(dst, src1, src2, vlen_enc); + } else { + assert(elem_bt == T_DOUBLE, ""); + vandpd(dst, src1, src2, vlen_enc); + } +} + +void C2_MacroAssembler::vpslldq_imm(BasicType elem_bt, XMMRegister dst, XMMRegister src, int shift, int vlen_enc) { + int elem_sz = type2aelembytes(elem_bt); + if (elem_sz == 2) { + vpsllw(dst, src, shift, vlen_enc); + } else if (elem_sz == 4) { + vpslld(dst, src, shift, vlen_enc); + } else if (elem_sz == 8) { + vpsllq(dst, src, shift, vlen_enc); + } else { + fatal("Unsupported lane size %s", type2name(elem_bt)); + } +} + +void C2_MacroAssembler::vpsrldq_imm(BasicType elem_bt, XMMRegister dst, XMMRegister src, int shift, int vlen_enc) { + int elem_sz = type2aelembytes(elem_bt); + if (elem_sz == 2) { + vpsrlw(dst, src, shift, vlen_enc); + } else if (elem_sz == 4) { + vpsrld(dst, src, shift, vlen_enc); + } else if (elem_sz == 8) { + vpsrlq(dst, src, shift, vlen_enc); + } else { + fatal("Unsupported lane size %s", type2name(elem_bt)); + } +} + +void C2_MacroAssembler::vector_copy_sign_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp, int vlen_enc) { + int sign_mask_shift = elem_bt == T_DOUBLE ? 63 : 31; + // set all double lanes of temporary vector to 0xFFFFFFFF + vcmpps(xtmp, xtmp, xtmp, Assembler::EQ_UQ, vlen_enc); + // compute mask for magnitude bits i.e. 0x7FFFFFFFF + vpsrldq_imm(elem_bt, xtmp, xtmp, 1, vlen_enc); + // extract magnitude bits from destination lanes. + vandpsd(elem_bt, dst, dst, xtmp, vlen_enc); + // compute mask for sign bit i.e. 0x80000000 + vpslldq_imm(elem_bt, xtmp, xtmp, sign_mask_shift, vlen_enc); + // extract sign bit from source lanes. + vandpsd(elem_bt, xtmp, xtmp, src, vlen_enc); + // merge extracted sign with magnitude bits. + vpor(dst, dst, xtmp, vlen_enc); +} diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index 6e49cdefa6c94..aab29b7182ce1 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -583,4 +583,13 @@ void select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc); + void vandpsd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc); + + void vpsrldq_imm(BasicType elem_bt, XMMRegister dst, XMMRegister src, int shift, int vlen_enc); + + void vpslldq_imm(BasicType elem_bt, XMMRegister dst, XMMRegister src, int shift, int vlen_enc); + + void vector_copy_sign_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp, int vlen_enc); + + void vector_copy_sign_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp, int vlen_enc); #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index a949c89523e8e..a8a78537327a8 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1613,6 +1613,7 @@ bool Matcher::match_rule_supported(int opcode) { if (UseAVX < 1 || !is_LP64) { return false; } + break; #ifndef _LP64 case Op_AddReductionVF: case Op_AddReductionVD: @@ -1767,6 +1768,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { return false; } break; + case Op_CopySignVD: + case Op_CopySignVF: + if (UseAVX < 1 || !is_LP64) { + return false; + } + break; case Op_MaxV: case Op_MinV: if (UseSSE < 4 && is_integral_type(bt)) { @@ -6707,34 +6714,6 @@ instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{ ins_pipe( pipe_slow ); %} -// ---------------------------------------------------------------------- -// We are using bitwise ternary logic insturction VPTERNLOG which can -// absorb complex binary expressions involving 3 boolean variables. -// -// For copySign we set the truth table value as 0xE4. -// First column of truth table represents magnitude, second column -// represents sign operand while the third column is a conditional -// operand with fixed value of 0x7FFFFFFF. -// -// Whenever condition bit is 1 corresponding magnitude bit gets selected -// else corresponding sign bit is picked. -// Our condition mask is such that apart for sign bit i.e. MSB bit all -// other bits are set to 1, this ensures that all the bits of result -// apart from MSB bit are copied from magnitude operand while sign bit -// is borrowed from sign operand. -// -// Magnitude Sign Condition Result -// 0 0 0 0 -// 0 0 1 0 -// 0 1 0 1 -// 0 1 1 0 -// 1 0 0 0 -// 1 0 1 1 -// 1 1 0 1 -// 1 1 1 1 -// -// ---------------------------------------------------------------------- - #ifdef _LP64 instruct copySignF_reg(regF dst, regF src, regF xtmp) %{ predicate(VM_Version::supports_avx512vl()); @@ -6742,9 +6721,7 @@ instruct copySignF_reg(regF dst, regF src, regF xtmp) %{ effect(TEMP xtmp); format %{ "CopySignF $dst, $src\t! using $xtmp as TEMP" %} ins_encode %{ - __ vpternlogd($xtmp$$XMMRegister, 0xFF, $xtmp$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); - __ vpsrld($xtmp$$XMMRegister, $xtmp$$XMMRegister, 1, Assembler::AVX_128bit); - __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); + __ vector_copy_sign_evex(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} @@ -6756,44 +6733,49 @@ instruct copySignD_imm(regD dst, regD src, regD xtmp, immD zero) %{ effect(TEMP xtmp); format %{ "CopySignD $dst, $src\t! using $xtmp as TEMP" %} ins_encode %{ - __ vpternlogq($xtmp$$XMMRegister, 0xFF, $xtmp$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); - __ vpsrlq($xtmp$$XMMRegister, $xtmp$$XMMRegister, 1, Assembler::AVX_128bit); - __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); + __ vector_copy_sign_evex(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} #endif // _LP64 -instruct copySignF_reg_avx(regF dst, regF src, regF xtmp1, regF xtmp2) %{ +instruct copySignF_reg_avx(regF dst, regF src, regF xtmp) %{ predicate(!VM_Version::supports_avx512vl()); match(Set dst (CopySignF dst src)); - effect(TEMP_DEF dst,TEMP xtmp1, TEMP xtmp2); - format %{ "CopySignF $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %} + effect(TEMP_DEF dst,TEMP xtmp); + format %{ "CopySignF $dst, $src\t! using $xtmp as TEMP" %} ins_encode %{ - __ vpcmpeqd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); - __ vpsrld($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); - __ vpxor($xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); - __ vandps($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit); - __ vandps($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); - __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, Assembler::AVX_128bit); + __ vector_copy_sign_avx(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); %} ins_pipe( pipe_slow ); %} -instruct copySignD_imm_avx(regD dst, regD src, regD xtmp1, regD xtmp2, immD zero) %{ +instruct copySignD_imm_avx(regD dst, regD src, regD xtmp, immD zero) %{ predicate(!VM_Version::supports_avx512vl()); match(Set dst (CopySignD dst (Binary src zero))); ins_cost(100); - effect(TEMP_DEF dst,TEMP xtmp1, TEMP xtmp2); - format %{ "CopySignD $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %} - ins_encode %{ - __ vpcmpeqq($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); - __ vpsrlq($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 1, Assembler::AVX_128bit); - __ vpxor($xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, Assembler::AVX_128bit); - __ vandpd($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit); - __ vandpd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); - __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, Assembler::AVX_128bit); + effect(TEMP_DEF dst,TEMP xtmp); + format %{ "CopySignD $dst, $src\t! using $xtmp as TEMP" %} + ins_encode %{ + __ vector_copy_sign_avx(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister, Assembler::AVX_128bit); + %} + ins_pipe( pipe_slow ); +%} + +instruct copySignV_reg(vec dst, vec src, vec xtmp) %{ + match(Set dst (CopySignVF dst src)); + match(Set dst (CopySignVD dst src)); + effect(TEMP xtmp); + format %{ "vector_copysign $dst, $src\t! using $xtmp as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType bt = Matcher::vector_element_basic_type(this); + if (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(this) == 64) { + __ vector_copy_sign_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister, vlen_enc); + } else { + __ vector_copy_sign_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister, vlen_enc); + } %} ins_pipe( pipe_slow ); %} diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index dfa414ef56484..634d978f729ac 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4364,7 +4364,7 @@ bool MatchRule::is_vector() const { "FmaVD","FmaVF","PopCountVI","PopCountVL","PopulateIndex","VectorLongToMask", "CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV", // Next are vector mask ops. - "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", + "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", "CopySignVF", "CopySignVD", "RoundVF", "RoundVD", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 60ee3e01137b0..d1bf8f753738b 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -336,6 +336,8 @@ macro(SignumD) macro(SignumF) macro(SignumVF) macro(SignumVD) +macro(CopySignVF) +macro(CopySignVD) macro(SqrtD) macro(SqrtF) macro(RoundF) diff --git a/src/hotspot/share/opto/superwordVTransformBuilder.cpp b/src/hotspot/share/opto/superwordVTransformBuilder.cpp index aee6add2a98ef..46e496c6a074e 100644 --- a/src/hotspot/share/opto/superwordVTransformBuilder.cpp +++ b/src/hotspot/share/opto/superwordVTransformBuilder.cpp @@ -156,6 +156,9 @@ VTransformVectorNode* SuperWordVTransformBuilder::make_vector_vtnode_for_pack(co // v = MulAddS2I(a, b) = a0 * b0 + a1 + b1 assert(p0->req() == 5, "MulAddS2I should have 4 operands"); vtn = new (_vtransform.arena()) VTransformElementWiseVectorNode(_vtransform, 3, pack_size); + } else if (opc == Op_CopySignD) { + assert(p0->req() == 4, "CopySignD should have 3 operands"); + vtn = new (_vtransform.arena()) VTransformElementWiseVectorNode(_vtransform, 3, pack_size); } else { assert(p0->req() == 3 || p0->is_CMove() || @@ -164,6 +167,7 @@ VTransformVectorNode* SuperWordVTransformBuilder::make_vector_vtnode_for_pack(co VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc) || opc == Op_FmaD || opc == Op_FmaF || + opc == Op_CopySignF || opc == Op_SignumF || opc == Op_SignumD, "pack type must be in this list"); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 4ef864fe9c445..8a15223d05b24 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -266,6 +266,10 @@ int VectorNode::opcode(int sopc, BasicType bt) { return Op_SignumVF; case Op_SignumD: return Op_SignumVD; + case Op_CopySignF: + return Op_CopySignVF; + case Op_CopySignD: + return Op_CopySignVD; default: assert(!VectorNode::is_convert_opcode(sopc), @@ -720,6 +724,9 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b case Op_SqrtVF: return new SqrtVFNode(n1, vt); case Op_SqrtVD: return new SqrtVDNode(n1, vt); + case Op_CopySignVF: return new CopySignVFNode(n1, n2, vt); + case Op_CopySignVD: return new CopySignVDNode(n1, n2, vt); + case Op_RoundVF: return new RoundVFNode(n1, vt); case Op_RoundVD: return new RoundVDNode(n1, vt); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 0bd1c71d7e628..cfe618b6f74e8 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -2015,6 +2015,22 @@ class ReverseBytesVNode : public VectorNode { virtual int Opcode() const; }; +class CopySignVFNode : public VectorNode { +public: + CopySignVFNode(Node* in1, Node* in2, const TypeVect* vt) + : VectorNode(in1, in2, vt) {} + + virtual int Opcode() const; +}; + +class CopySignVDNode : public VectorNode { +public: + CopySignVDNode(Node* in1, Node* in2, const TypeVect* vt) + : VectorNode(in1, in2, vt) {} + + virtual int Opcode() const; +}; + class SignumVFNode : public VectorNode { public: SignumVFNode(Node* in1, Node* zero, Node* one, const TypeVect* vt) diff --git a/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java b/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java index 7318bee1f328c..c39656dcc4d30 100644 --- a/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java +++ b/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java @@ -103,7 +103,7 @@ public TestCopySignIntrinsic() { } @Test - @IR(counts = {IRNode.COPYSIGN_F, " >0 "}, applyIfCPUFeature = { "avx", "true"}) + @IR(counts = {IRNode.COPYSIGN_F, " >0 ", IRNode.COPYSIGN_VF, " >0 "}, applyIfCPUFeature = { "avx", "true"}) public void testCopySignF() { for (int i = 0; i < SIZE; i++) { afresult[i] = Math.copySign(fmagnitude[i], fsign[i]); @@ -123,7 +123,7 @@ public void checkCopySignF() { } @Test - @IR(counts = {IRNode.COPYSIGN_D, " >0 "}, applyIfCPUFeature = { "avx", "true"}) + @IR(counts = {IRNode.COPYSIGN_D, " >0 ", IRNode.COPYSIGN_VD, " >0 "}, applyIfCPUFeature = { "avx", "true"}) public void testCopySignD() { for (int i = 0; i < SIZE; i++) { adresult[i] = Math.copySign(dmagnitude[i], dsign[i]); diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index ec77ae0db2bd2..d9519da2059bf 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -1601,6 +1601,16 @@ public class IRNode { beforeMatchingNameRegex(COPYSIGN_D, "CopySignD"); } + public static final String COPYSIGN_VD = VECTOR_PREFIX + "COPYSIGN_VD" + POSTFIX; + static { + vectorNode(COPYSIGN_VD, "CopySignVD", TYPE_DOUBLE); + } + + public static final String COPYSIGN_VF = VECTOR_PREFIX + "COPYSIGN_VF" + POSTFIX; + static { + vectorNode(COPYSIGN_VF, "CopySignVF", TYPE_FLOAT); + } + public static final String SIGNUM_VD = VECTOR_PREFIX + "SIGNUM_VD" + POSTFIX; static { vectorNode(SIGNUM_VD, "SignumVD", TYPE_DOUBLE); From ecc36582e170bdea25e05d10030070511a6ae126 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Tue, 6 May 2025 22:17:12 +0530 Subject: [PATCH 4/4] Review comments resolutions --- src/hotspot/cpu/x86/x86.ad | 4 +- .../math/TestCopySignIntrinsic.java | 47 +++++-------------- 2 files changed, 15 insertions(+), 36 deletions(-) diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 7ba550bdddbfc..52eb9e96e542d 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1538,7 +1538,7 @@ bool Matcher::match_rule_supported(int opcode) { break; case Op_CopySignD: case Op_CopySignF: - if (UseAVX < 1 || !is_LP64) { + if (UseAVX < 1) { return false; } break; @@ -1668,7 +1668,7 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { break; case Op_CopySignVD: case Op_CopySignVF: - if (UseAVX < 1 || !is_LP64) { + if (UseAVX < 1) { return false; } break; diff --git a/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java b/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java index c39656dcc4d30..ba225cc50d200 100644 --- a/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java +++ b/test/hotspot/jtreg/compiler/intrinsics/math/TestCopySignIntrinsic.java @@ -38,9 +38,12 @@ import compiler.lib.ir_framework.Test; import compiler.lib.ir_framework.TestFramework; import compiler.lib.ir_framework.Setup; +import compiler.lib.verify.*; import java.util.stream.IntStream; import java.util.Random; import jdk.test.lib.Utils; +import compiler.lib.generators.Generator; +import static compiler.lib.generators.Generators.G; public class TestCopySignIntrinsic { private static final Random rd = Utils.getRandomInstance(); @@ -73,28 +76,14 @@ public TestCopySignIntrinsic() { adresult = new double[SIZE]; edresult = new double[SIZE]; - IntStream.range(0, SIZE - 8).forEach(i -> { fmagnitude[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); - IntStream.range(0, SIZE - 8).forEach(i -> { dmagnitude[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); - IntStream.range(0, SIZE).forEach(i -> { fsign[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); - IntStream.range(0, SIZE).forEach(i -> { dsign[i] = rd.nextFloat(-Float.MAX_VALUE, Float.MAX_VALUE); }); - - fmagnitude[SIZE - 1] = Float.NaN; - fmagnitude[SIZE - 2] = Float.NaN; - fmagnitude[SIZE - 3] = 0.0f; - fmagnitude[SIZE - 4] = -0.0f; - fmagnitude[SIZE - 5] = Float.NEGATIVE_INFINITY; - fmagnitude[SIZE - 6] = Float.POSITIVE_INFINITY; - fmagnitude[SIZE - 7] = -Float.MIN_VALUE; - fmagnitude[SIZE - 8] = Float.MIN_VALUE; - - dmagnitude[SIZE - 1] = Double.NaN; - dmagnitude[SIZE - 2] = Double.NaN; - dmagnitude[SIZE - 3] = 0.0; - dmagnitude[SIZE - 4] = -0.0; - dmagnitude[SIZE - 5] = Double.NEGATIVE_INFINITY; - dmagnitude[SIZE - 6] = Double.POSITIVE_INFINITY; - dmagnitude[SIZE - 7] = -Double.MIN_VALUE; - dmagnitude[SIZE - 8] = Double.MIN_VALUE; + Generator genFloat = G.floats(); + Generator genDouble = G.doubles(); + for (int i = 0; i < SIZE; i++) { + fmagnitude[i] = genFloat.next(); + dmagnitude[i] = genFloat.next(); + fsign[i] = genFloat.next(); + dsign[i] = genFloat.next(); + } for (int i = 0; i < SIZE; i++) { efresult[i] = Math.copySign(fmagnitude[i], fsign[i]); @@ -113,12 +102,7 @@ public void testCopySignF() { @Check(test = "testCopySignF") public void checkCopySignF() { for (int i = 0; i < SIZE; i++) { - if (afresult[i] != efresult[i]) { - if (Float.isNaN(afresult[i]) ^ Float.isNaN(efresult[i])) { - throw new RuntimeException("Incorrect result, Math.copySign(" + fmagnitude[i] + " , " + fsign[i] + ") => " + - "expected : " + efresult[i] + " != " + " actual : " + afresult[i]); - } - } + Verify.checkEQ(afresult[i], efresult[i]); } } @@ -133,12 +117,7 @@ public void testCopySignD() { @Check(test = "testCopySignD") public void checkCopySignD() { for (int i = 0; i < SIZE; i++) { - if (Double.isNaN(adresult[i]) ^ Double.isNaN(edresult[i])) { - if (adresult[i] != edresult[i]) { - throw new RuntimeException("Incorrect result, Math.copySign(" + dmagnitude[i] + " , " + dsign[i] + ") => " + - "expected : " + edresult[i] + " != " + " actual : " + adresult[i]); - } - } + Verify.checkEQ(adresult[i], edresult[i]); } } }