From 20b6762ddc7a8bd1a7686beb6590b7d764f67d64 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 7 Feb 2022 03:01:09 +0000 Subject: [PATCH] 8265263: AArch64: Combine vneg with right shift count *** Implementation In AArch64 NEON, vector shift right is implemented by vector shift left instructions (SSHL[1] and USHL[2]) with a negative shift count value. In the C2 backend, we generate a `neg` of the given shift value followed by an `sshl` or `ushl` instruction. For vector shift right, the vector shift count has two origins: 1) it can be duplicated from scalar variable/immediate(case-1), 2) it can be loaded directly from one vector(case-2). This patch aims to optimize case-1. Specifically, we move the negate from RShiftV* rules to RShiftCntV rule. As a result, the negate can be hoisted outside of the loop if it's a loop invariant. In this patch, 1) we split vshiftcnt* rules into vslcnt* and vsrcnt* rules to handle shift left and shift right respectively. Compared to vslcnt* rules, the negate is conducted in vsrcnt*. 2) for each vsra* and vsrl* rule, we create one variant, i.e. vsra*_var and vsrl*_var. We use vsra* and vsrl* rules to handle case-1, and use vsra*_var and vsrl*_var rules to handle case-2. Note that ShiftVNode::is_var_shift() can be used to distinguish case-1 from case-2. 3) we add one assertion for the vs*_imm rules as we have done on ARM32[3]. 4) several style issues are resolved. *** Example Take function `rShiftInt()` in the newly added micro benchmark VectorShiftRight.java as an example. ``` public void rShiftInt() { for (int i = 0; i < SIZE; i++) { intsB[i] = intsA[i] >> count; } } ``` Arithmetic shift right is conducted inside a big loop. The following code snippet shows the disassembly code generated by auto-vectorization before we apply the current patch. We can see that `neg` is conducted in the loop body. 
``` 0x0000ffff89057a64: dup v16.16b, w13 <-- dup 0x0000ffff89057a68: mov w12, #0x7d00 // #32000 0x0000ffff89057a6c: sub w13, w2, w10 0x0000ffff89057a70: cmp w2, w10 0x0000ffff89057a74: csel w13, wzr, w13, lt 0x0000ffff89057a78: mov w8, #0x7d00 // #32000 0x0000ffff89057a7c: cmp w13, w8 0x0000ffff89057a80: csel w13, w12, w13, hi 0x0000ffff89057a84: add w14, w13, w10 0x0000ffff89057a88: nop 0x0000ffff89057a8c: nop 0x0000ffff89057a90: sbfiz x13, x10, #2, #32 <-- loop entry 0x0000ffff89057a94: add x15, x17, x13 0x0000ffff89057a98: ldr q17, [x15,#16] 0x0000ffff89057a9c: add x13, x0, x13 0x0000ffff89057aa0: neg v18.16b, v16.16b <-- neg 0x0000ffff89057aa4: sshl v17.4s, v17.4s, v18.4s <-- shift right 0x0000ffff89057aa8: str q17, [x13,#16] 0x0000ffff89057aac: ... 0x0000ffff89057b1c: add w10, w10, #0x20 0x0000ffff89057b20: cmp w10, w14 0x0000ffff89057b24: b.lt 0x0000ffff89057a90 <-- loop end ``` Here is the disassembly code after we apply current patch. We can see that the negate is no longer conducted inside the loop, and it is hoisted to the outside. ``` 0x0000ffff8d053a68: neg w14, w13 <---- neg 0x0000ffff8d053a6c: dup v16.16b, w14 <---- dup 0x0000ffff8d053a70: sub w14, w2, w10 0x0000ffff8d053a74: cmp w2, w10 0x0000ffff8d053a78: csel w14, wzr, w14, lt 0x0000ffff8d053a7c: mov w8, #0x7d00 // #32000 0x0000ffff8d053a80: cmp w14, w8 0x0000ffff8d053a84: csel w14, w12, w14, hi 0x0000ffff8d053a88: add w13, w14, w10 0x0000ffff8d053a8c: nop 0x0000ffff8d053a90: sbfiz x14, x10, #2, #32 <-- loop entry 0x0000ffff8d053a94: add x15, x17, x14 0x0000ffff8d053a98: ldr q17, [x15,#16] 0x0000ffff8d053a9c: sshl v17.4s, v17.4s, v16.4s <-- shift right 0x0000ffff8d053aa0: add x14, x0, x14 0x0000ffff8d053aa4: str q17, [x14,#16] 0x0000ffff8d053aa8: ... 0x0000ffff8d053afc: add w10, w10, #0x20 0x0000ffff8d053b00: cmp w10, w13 0x0000ffff8d053b04: b.lt 0x0000ffff8d053a90 <-- loop end ``` *** Testing Tier1~3 tests passed on Linux/AArch64 platform. 
*** Performance Evaluation - Auto-vectorization One micro benchmark, i.e. VectorShiftRight.java, is added by this patch in order to evaluate the optimization on vector shift right. The following table shows the result. Column `Score-1` shows the score before we apply the current patch, and column `Score-2` shows the score when we apply the current patch. We observe about 30% ~ 53% improvement on the microbenchmarks. ``` Benchmark Units Score-1 Score-2 VectorShiftRight.rShiftByte ops/ms 10601.980 13816.353 VectorShiftRight.rShiftInt ops/ms 3592.831 5502.941 VectorShiftRight.rShiftLong ops/ms 1584.012 2425.247 VectorShiftRight.rShiftShort ops/ms 6643.414 9728.762 VectorShiftRight.urShiftByte ops/ms 2066.965 2048.336 (*) VectorShiftRight.urShiftChar ops/ms 6660.805 9728.478 VectorShiftRight.urShiftInt ops/ms 3592.909 5514.928 VectorShiftRight.urShiftLong ops/ms 1583.995 2422.991 *: Logical shift right for Byte type(urShiftByte) is not vectorized, as discussed in [4]. ``` - VectorAPI Furthermore, we also evaluate the impact of this patch on VectorAPI benchmarks, e.g., [5]. Details can be found in the table below. Columns `Score-1` and `Score-2` show the scores before and after applying the current patch. 
``` Benchmark Units Score-1 Score-2 Byte128Vector.LSHL ops/ms 10867.666 10873.993 Byte128Vector.LSHLShift ops/ms 10945.729 10945.741 Byte128Vector.LSHR ops/ms 8629.305 8629.343 Byte128Vector.LSHRShift ops/ms 8245.864 10303.521 <-- Byte128Vector.ASHR ops/ms 8619.691 8629.438 Byte128Vector.ASHRShift ops/ms 8245.860 10305.027 <-- Int128Vector.LSHL ops/ms 3104.213 3103.702 Int128Vector.LSHLShift ops/ms 3114.354 3114.371 Int128Vector.LSHR ops/ms 2380.717 2380.693 Int128Vector.LSHRShift ops/ms 2312.871 2992.377 <-- Int128Vector.ASHR ops/ms 2380.668 2380.647 Int128Vector.ASHRShift ops/ms 2312.894 2992.332 <-- Long128Vector.LSHL ops/ms 1586.907 1587.591 Long128Vector.LSHLShift ops/ms 1589.469 1589.540 Long128Vector.LSHR ops/ms 1209.754 1209.687 Long128Vector.LSHRShift ops/ms 1174.718 1527.502 <-- Long128Vector.ASHR ops/ms 1209.713 1209.669 Long128Vector.ASHRShift ops/ms 1174.712 1527.174 <-- Short128Vector.LSHL ops/ms 5945.542 5943.770 Short128Vector.LSHLShift ops/ms 5984.743 5984.640 Short128Vector.LSHR ops/ms 4613.378 4613.577 Short128Vector.LSHRShift ops/ms 4486.023 5746.466 <-- Short128Vector.ASHR ops/ms 4613.389 4613.478 Short128Vector.ASHRShift ops/ms 4486.019 5746.368 <-- ``` 1) For logical shift left(LSHL and LSHLShift), and shift right with variable vector shift count(LSHR and ASHR) cases, we did not observe much change, which is expected. 2) For shift right with scalar shift count(LSHRShift and ASHRShift) case, about 25% ~ 30% improvement can be observed, and this benefit is introduced by the current patch. 
[1] https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register-- [2] https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHL--Unsigned-Shift-Left--register-- [3] https://github.com/openjdk/jdk18/pull/41 [4] https://github.com/openjdk/jdk/pull/1087 [5] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/Byte128Vector.java#L509 --- src/hotspot/cpu/aarch64/aarch64.ad | 9 + src/hotspot/cpu/aarch64/aarch64_neon.ad | 611 ++++++++++++------ src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 | 434 +++++++------ .../bench/vm/compiler/VectorShiftRight.java | 129 ++++ 4 files changed, 809 insertions(+), 374 deletions(-) create mode 100644 test/micro/org/openjdk/bench/vm/compiler/VectorShiftRight.java diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index f21835f9de3ef..96075ecd9d5a6 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -1311,6 +1311,9 @@ public: // predicate controlling translation of CompareAndSwapX bool needs_acquiring_load_exclusive(const Node *load); + // Assert that the given node is not a variable shift. + bool assert_not_var_shift(const Node* n); + // predicate controlling addressing modes bool size_fits_all_mem_uses(AddPNode* addp, int shift); %} @@ -1725,6 +1728,12 @@ bool needs_acquiring_load_exclusive(const Node *n) return true; } +// Assert that the given node is not a variable shift. +bool assert_not_var_shift(const Node* n) { + assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); + return true; +} + #define __ _masm. 
// advance declarations for helper functions to convert register diff --git a/src/hotspot/cpu/aarch64/aarch64_neon.ad b/src/hotspot/cpu/aarch64/aarch64_neon.ad index 7c84a93583b10..feecd8ab90add 100644 --- a/src/hotspot/cpu/aarch64/aarch64_neon.ad +++ b/src/hotspot/cpu/aarch64/aarch64_neon.ad @@ -1,5 +1,5 @@ -// Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. -// Copyright (c) 2020, 2021, Arm Limited. All rights reserved. +// Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2022, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -4400,11 +4400,17 @@ instruct vxor16B(vecX dst, vecX src1, vecX src2) // ------------------------------ Shift --------------------------------------- -instruct vshiftcnt8B(vecD dst, iRegIorL2I cnt) %{ +// Vector shift count +// Note-1: Low 8 bits of each element are used, so it doesn't matter if we +// treat it as ints or bytes here. +// Note-2: Shift value is negated for RShiftCntV additionally. See the comments +// on vsra8B rule for more details. 
+ +instruct vslcnt8B(vecD dst, iRegIorL2I cnt) %{ predicate(UseSVE == 0 && (n->as_Vector()->length_in_bytes() == 4 || - n->as_Vector()->length_in_bytes() == 8)); + n->as_Vector()->length_in_bytes() == 8)); match(Set dst (LShiftCntV cnt)); - match(Set dst (RShiftCntV cnt)); + ins_cost(INSN_COST); format %{ "dup $dst, $cnt\t# shift count vector (8B)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T8B, as_Register($cnt$$reg)); @@ -4412,10 +4418,10 @@ instruct vshiftcnt8B(vecD dst, iRegIorL2I cnt) %{ ins_pipe(vdup_reg_reg64); %} -instruct vshiftcnt16B(vecX dst, iRegIorL2I cnt) %{ - predicate(UseSVE == 0 && (n->as_Vector()->length_in_bytes() == 16)); +instruct vslcnt16B(vecX dst, iRegIorL2I cnt) %{ + predicate(UseSVE == 0 && n->as_Vector()->length_in_bytes() == 16); match(Set dst (LShiftCntV cnt)); - match(Set dst (RShiftCntV cnt)); + ins_cost(INSN_COST); format %{ "dup $dst, $cnt\t# shift count vector (16B)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($cnt$$reg)); @@ -4423,9 +4429,35 @@ instruct vshiftcnt16B(vecX dst, iRegIorL2I cnt) %{ ins_pipe(vdup_reg_reg128); %} +instruct vsrcnt8B(vecD dst, iRegIorL2I cnt) %{ + predicate(UseSVE == 0 && (n->as_Vector()->length_in_bytes() == 4 || + n->as_Vector()->length_in_bytes() == 8)); + match(Set dst (RShiftCntV cnt)); + ins_cost(INSN_COST * 2); + format %{ "negw rscratch1, $cnt\t" + "dup $dst, rscratch1\t# shift count vector (8B)" %} + ins_encode %{ + __ negw(rscratch1, as_Register($cnt$$reg)); + __ dup(as_FloatRegister($dst$$reg), __ T8B, rscratch1); + %} + ins_pipe(vdup_reg_reg64); +%} + +instruct vsrcnt16B(vecX dst, iRegIorL2I cnt) %{ + predicate(UseSVE == 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (RShiftCntV cnt)); + ins_cost(INSN_COST * 2); + format %{ "negw rscratch1, $cnt\t" + "dup $dst, rscratch1\t# shift count vector (16B)" %} + ins_encode %{ + __ negw(rscratch1, as_Register($cnt$$reg)); + __ dup(as_FloatRegister($dst$$reg), __ T16B, rscratch1); + %} + 
ins_pipe(vdup_reg_reg128); +%} + instruct vsll8B(vecD dst, vecD src, vecD shift) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8); match(Set dst (LShiftVB src shift)); ins_cost(INSN_COST); format %{ "sshl $dst,$src,$shift\t# vector (8B)" %} @@ -4459,8 +4491,6 @@ instruct vsll16B(vecX dst, vecX src, vecX shift) %{ // LoadVector RShiftCntV // | / // RShiftVI -// Note: In inner loop, multiple neg instructions are used, which can be -// moved to outer loop and merge into one neg instruction. // // Case 2: The vector shift count is from loading. // This case isn't supported by middle-end now. But it's supported by @@ -4470,83 +4500,145 @@ instruct vsll16B(vecX dst, vecX src, vecX shift) %{ // | / // RShiftVI // +// The negate is conducted in RShiftCntV rule for case 1, whereas it's done in +// RShiftV* rules for case 2. Because there exists an optimization opportunity +// for case 1, that is, multiple neg instructions in inner loop can be hoisted +// to outer loop and merged into one neg instruction. +// +// Note that ShiftVNode::is_var_shift() indicates whether the vector shift +// count is a variable vector(case 2) or not(a vector generated by RShiftCntV, +// i.e. case 1). 
-instruct vsra8B(vecD dst, vecD src, vecD shift, vecD tmp) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); +instruct vsra8B(vecD dst, vecD src, vecD shift) %{ + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVB src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (8B)" %} + format %{ "sshl $dst,$src,$shift\t# vector (8B)" %} + ins_encode %{ + __ sshl(as_FloatRegister($dst$$reg), __ T8B, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift64); +%} + +instruct vsra8B_var(vecD dst, vecD src, vecD shift) %{ + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVB src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (8B)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T8B, + __ negr(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift64); %} -instruct vsra16B(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 16); +instruct vsra16B(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16 && !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVB src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (16B)" %} + format %{ "sshl $dst,$src,$shift\t# vector (16B)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ sshl(as_FloatRegister($dst$$reg), __ T16B, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct 
vsra16B_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16 && n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVB src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (16B)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} -instruct vsrl8B(vecD dst, vecD src, vecD shift, vecD tmp) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); +instruct vsrl8B(vecD dst, vecD src, vecD shift) %{ + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVB src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (8B)" %} + format %{ "ushl $dst,$src,$shift\t# vector (8B)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T8B, + __ ushl(as_FloatRegister($dst$$reg), __ T8B, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift64); +%} + +instruct vsrl8B_var(vecD dst, vecD src, vecD shift) %{ + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVB src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (8B)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift64); %} -instruct vsrl16B(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 16); +instruct 
vsrl16B(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16 && !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVB src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (16B)" %} + format %{ "ushl $dst,$src,$shift\t# vector (16B)" %} + ins_encode %{ + __ ushl(as_FloatRegister($dst$$reg), __ T16B, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct vsrl16B_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16 && n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVB src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (16B)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} instruct vsll8B_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + assert_not_var_shift(n)); match(Set dst (LShiftVB src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (8B)" %} + format %{ "shl $dst, $src, $shift\t# vector (8B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) { @@ -4562,10 +4654,10 @@ instruct vsll8B_imm(vecD dst, vecD src, immI shift) %{ %} instruct vsll16B_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 16); + predicate(n->as_Vector()->length() == 16 && assert_not_var_shift(n)); match(Set dst (LShiftVB src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (16B)" %} + format %{ "shl $dst, 
$src, $shift\t# vector (16B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) { @@ -4581,40 +4673,40 @@ instruct vsll16B_imm(vecX dst, vecX src, immI shift) %{ %} instruct vsra8B_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + assert_not_var_shift(n)); match(Set dst (RShiftVB src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (8B)" %} + format %{ "sshr $dst, $src, $shift\t# vector (8B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) sh = 7; __ sshr(as_FloatRegister($dst$$reg), __ T8B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift64_imm); %} instruct vsra16B_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 16); + predicate(n->as_Vector()->length() == 16 && assert_not_var_shift(n)); match(Set dst (RShiftVB src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (16B)" %} + format %{ "sshr $dst, $src, $shift\t# vector (16B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) sh = 7; __ sshr(as_FloatRegister($dst$$reg), __ T16B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift128_imm); %} instruct vsrl8B_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); + predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) && + assert_not_var_shift(n)); match(Set dst (URShiftVB src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (8B)" %} + format %{ "ushr $dst, $src, $shift\t# vector (8B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) { @@ -4623,17 +4715,17 @@ instruct vsrl8B_imm(vecD dst, vecD src, immI shift) %{ as_FloatRegister($src$$reg)); } else { __ 
ushr(as_FloatRegister($dst$$reg), __ T8B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift64_imm); %} instruct vsrl16B_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 16); + predicate(n->as_Vector()->length() == 16 && assert_not_var_shift(n)); match(Set dst (URShiftVB src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (16B)" %} + format %{ "ushr $dst, $src, $shift\t# vector (16B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) { @@ -4642,15 +4734,14 @@ instruct vsrl16B_imm(vecX dst, vecX src, immI shift) %{ as_FloatRegister($src$$reg)); } else { __ ushr(as_FloatRegister($dst$$reg), __ T16B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift128_imm); %} instruct vsll4S(vecD dst, vecD src, vecD shift) %{ - predicate(n->as_Vector()->length() == 2 || - n->as_Vector()->length() == 4); + predicate(n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4); match(Set dst (LShiftVS src shift)); ins_cost(INSN_COST); format %{ "sshl $dst,$src,$shift\t# vector (4H)" %} @@ -4675,82 +4766,136 @@ instruct vsll8S(vecX dst, vecX src, vecX shift) %{ ins_pipe(vshift128); %} -instruct vsra4S(vecD dst, vecD src, vecD shift, vecD tmp) %{ - predicate(n->as_Vector()->length() == 2 || - n->as_Vector()->length() == 4); +instruct vsra4S(vecD dst, vecD src, vecD shift) %{ + predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVS src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (4H)" %} + format %{ "sshl $dst,$src,$shift\t# vector (4H)" %} + ins_encode %{ + __ sshl(as_FloatRegister($dst$$reg), __ T4H, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift64); +%} + +instruct vsra4S_var(vecD dst, vecD src, vecD shift) %{ + 
predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVS src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (4H)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T8B, + __ negr(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T4H, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift64); %} -instruct vsra8S(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 8); +instruct vsra8S(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8 && !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVS src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (8H)" %} + format %{ "sshl $dst,$src,$shift\t# vector (8H)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ sshl(as_FloatRegister($dst$$reg), __ T8H, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct vsra8S_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8 && n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVS src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (8H)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T8H, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} -instruct vsrl4S(vecD dst, vecD src, vecD shift, vecD tmp) %{ - predicate(n->as_Vector()->length() == 2 || - n->as_Vector()->length() == 4); +instruct vsrl4S(vecD dst, vecD src, vecD shift) %{ + 
predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVS src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (4H)" %} + format %{ "ushl $dst,$src,$shift\t# vector (4H)" %} + ins_encode %{ + __ ushl(as_FloatRegister($dst$$reg), __ T4H, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift64); +%} + +instruct vsrl4S_var(vecD dst, vecD src, vecD shift) %{ + predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVS src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (4H)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T8B, + __ negr(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T4H, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift64); %} -instruct vsrl8S(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 8); +instruct vsrl8S(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8 && !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVS src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (8H)" %} + format %{ "ushl $dst,$src,$shift\t# vector (8H)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ ushl(as_FloatRegister($dst$$reg), __ T8H, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct vsrl8S_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8 && n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVS src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF 
dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (8H)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T8H, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} instruct vsll4S_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 2 || - n->as_Vector()->length() == 4); + predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + assert_not_var_shift(n)); match(Set dst (LShiftVS src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (4H)" %} + format %{ "shl $dst, $src, $shift\t# vector (4H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) { @@ -4766,10 +4911,10 @@ instruct vsll4S_imm(vecD dst, vecD src, immI shift) %{ %} instruct vsll8S_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && assert_not_var_shift(n)); match(Set dst (LShiftVS src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (8H)" %} + format %{ "shl $dst, $src, $shift\t# vector (8H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) { @@ -4785,40 +4930,40 @@ instruct vsll8S_imm(vecX dst, vecX src, immI shift) %{ %} instruct vsra4S_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 2 || - n->as_Vector()->length() == 4); + predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + assert_not_var_shift(n)); match(Set dst (RShiftVS src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (4H)" %} + format %{ "sshr $dst, $src, $shift\t# vector (4H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) sh = 15; __ sshr(as_FloatRegister($dst$$reg), __ T4H, - as_FloatRegister($src$$reg), sh); + 
as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift64_imm); %} instruct vsra8S_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && assert_not_var_shift(n)); match(Set dst (RShiftVS src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (8H)" %} + format %{ "sshr $dst, $src, $shift\t# vector (8H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) sh = 15; __ sshr(as_FloatRegister($dst$$reg), __ T8H, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift128_imm); %} instruct vsrl4S_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 2 || - n->as_Vector()->length() == 4); + predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) && + assert_not_var_shift(n)); match(Set dst (URShiftVS src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (4H)" %} + format %{ "ushr $dst, $src, $shift\t# vector (4H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) { @@ -4827,17 +4972,17 @@ instruct vsrl4S_imm(vecD dst, vecD src, immI shift) %{ as_FloatRegister($src$$reg)); } else { __ ushr(as_FloatRegister($dst$$reg), __ T4H, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift64_imm); %} instruct vsrl8S_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && assert_not_var_shift(n)); match(Set dst (URShiftVS src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (8H)" %} + format %{ "ushr $dst, $src, $shift\t# vector (8H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) { @@ -4846,7 +4991,7 @@ instruct vsrl8S_imm(vecX dst, vecX src, immI shift) %{ as_FloatRegister($src$$reg)); } else { __ ushr(as_FloatRegister($dst$$reg), __ T8H, - as_FloatRegister($src$$reg), 
sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift128_imm); @@ -4878,79 +5023,131 @@ instruct vsll4I(vecX dst, vecX src, vecX shift) %{ ins_pipe(vshift128); %} -instruct vsra2I(vecD dst, vecD src, vecD shift, vecD tmp) %{ - predicate(n->as_Vector()->length() == 2); +instruct vsra2I(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2 && !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVI src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (2S)" %} + format %{ "sshl $dst,$src,$shift\t# vector (2S)" %} + ins_encode %{ + __ sshl(as_FloatRegister($dst$$reg), __ T2S, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift64); +%} + +instruct vsra2I_var(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2 && n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVI src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (2S)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T8B, + __ negr(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift64); %} -instruct vsra4I(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 4); +instruct vsra4I(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4 && !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVI src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (4S)" %} + format %{ "sshl $dst,$src,$shift\t# vector (4S)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ sshl(as_FloatRegister($dst$$reg), __ T4S, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + 
%} + ins_pipe(vshift128); +%} + +instruct vsra4I_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4 && n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVI src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (4S)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} -instruct vsrl2I(vecD dst, vecD src, vecD shift, vecD tmp) %{ - predicate(n->as_Vector()->length() == 2); +instruct vsrl2I(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2 && !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVI src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (2S)" %} + format %{ "ushl $dst,$src,$shift\t# vector (2S)" %} + ins_encode %{ + __ ushl(as_FloatRegister($dst$$reg), __ T2S, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift64); +%} + +instruct vsrl2I_var(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2 && n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVI src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (2S)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T8B, + __ negr(as_FloatRegister($dst$$reg), __ T8B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift64); %} -instruct vsrl4I(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 4); +instruct vsrl4I(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 
4 && !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVI src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (4S)" %} + format %{ "ushl $dst,$src,$shift\t# vector (4S)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ ushl(as_FloatRegister($dst$$reg), __ T4S, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct vsrl4I_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4 && n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVI src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (4S)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} instruct vsll2I_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 2); + predicate(n->as_Vector()->length() == 2 && assert_not_var_shift(n)); match(Set dst (LShiftVI src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (2S)" %} + format %{ "shl $dst, $src, $shift\t# vector (2S)" %} ins_encode %{ __ shl(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), @@ -4960,10 +5157,10 @@ instruct vsll2I_imm(vecD dst, vecD src, immI shift) %{ %} instruct vsll4I_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 4); + predicate(n->as_Vector()->length() == 4 && assert_not_var_shift(n)); match(Set dst (LShiftVI src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (4S)" %} + format %{ "shl $dst, $src, $shift\t# vector (4S)" %} ins_encode %{ __ shl(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), @@ -4973,10 +5170,10 @@ 
instruct vsll4I_imm(vecX dst, vecX src, immI shift) %{ %} instruct vsra2I_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 2); + predicate(n->as_Vector()->length() == 2 && assert_not_var_shift(n)); match(Set dst (RShiftVI src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (2S)" %} + format %{ "sshr $dst, $src, $shift\t# vector (2S)" %} ins_encode %{ __ sshr(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), @@ -4986,10 +5183,10 @@ instruct vsra2I_imm(vecD dst, vecD src, immI shift) %{ %} instruct vsra4I_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 4); + predicate(n->as_Vector()->length() == 4 && assert_not_var_shift(n)); match(Set dst (RShiftVI src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (4S)" %} + format %{ "sshr $dst, $src, $shift\t# vector (4S)" %} ins_encode %{ __ sshr(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), @@ -4999,10 +5196,10 @@ instruct vsra4I_imm(vecX dst, vecX src, immI shift) %{ %} instruct vsrl2I_imm(vecD dst, vecD src, immI shift) %{ - predicate(n->as_Vector()->length() == 2); + predicate(n->as_Vector()->length() == 2 && assert_not_var_shift(n)); match(Set dst (URShiftVI src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (2S)" %} + format %{ "ushr $dst, $src, $shift\t# vector (2S)" %} ins_encode %{ __ ushr(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), @@ -5012,10 +5209,10 @@ instruct vsrl2I_imm(vecD dst, vecD src, immI shift) %{ %} instruct vsrl4I_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 4); + predicate(n->as_Vector()->length() == 4 && assert_not_var_shift(n)); match(Set dst (URShiftVI src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (4S)" %} + format %{ "ushr $dst, $src, $shift\t# vector (4S)" %} ins_encode %{ __ 
ushr(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), @@ -5037,45 +5234,71 @@ instruct vsll2L(vecX dst, vecX src, vecX shift) %{ ins_pipe(vshift128); %} -instruct vsra2L(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 2); +instruct vsra2L(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2 && !n->as_ShiftV()->is_var_shift()); match(Set dst (RShiftVL src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "sshl $dst,$src,$tmp\t# vector (2D)" %} + format %{ "sshl $dst,$src,$shift\t# vector (2D)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ sshl(as_FloatRegister($dst$$reg), __ T2D, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct vsra2L_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2 && n->as_ShiftV()->is_var_shift()); + match(Set dst (RShiftVL src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector (2D)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ sshl(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} -instruct vsrl2L(vecX dst, vecX src, vecX shift, vecX tmp) %{ - predicate(n->as_Vector()->length() == 2); +instruct vsrl2L(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2 && !n->as_ShiftV()->is_var_shift()); match(Set dst (URShiftVL src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "negr $tmp,$shift\t" - "ushl $dst,$src,$tmp\t# vector (2D)" %} + format %{ "ushl $dst,$src,$shift\t# vector (2D)" %} + ins_encode %{ + __ ushl(as_FloatRegister($dst$$reg), __ T2D, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift128); +%} + +instruct 
vsrl2L_var(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2 && n->as_ShiftV()->is_var_shift()); + match(Set dst (URShiftVL src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector (2D)" %} ins_encode %{ - __ negr(as_FloatRegister($tmp$$reg), __ T16B, + __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($shift$$reg)); __ ushl(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} ins_pipe(vshift128); %} instruct vsll2L_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 2); + predicate(n->as_Vector()->length() == 2 && assert_not_var_shift(n)); match(Set dst (LShiftVL src (LShiftCntV shift))); ins_cost(INSN_COST); - format %{ "shl $dst, $src, $shift\t# vector (2D)" %} + format %{ "shl $dst, $src, $shift\t# vector (2D)" %} ins_encode %{ __ shl(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), @@ -5085,10 +5308,10 @@ instruct vsll2L_imm(vecX dst, vecX src, immI shift) %{ %} instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 2); + predicate(n->as_Vector()->length() == 2 && assert_not_var_shift(n)); match(Set dst (RShiftVL src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "sshr $dst, $src, $shift\t# vector (2D)" %} + format %{ "sshr $dst, $src, $shift\t# vector (2D)" %} ins_encode %{ __ sshr(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), @@ -5098,10 +5321,10 @@ instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{ %} instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{ - predicate(n->as_Vector()->length() == 2); + predicate(n->as_Vector()->length() == 2 && assert_not_var_shift(n)); match(Set dst (URShiftVL src (RShiftCntV shift))); ins_cost(INSN_COST); - format %{ "ushr $dst, $src, $shift\t# vector (2D)" %} + format %{ "ushr $dst, $src, $shift\t# vector (2D)" %} 
ins_encode %{ __ ushr(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), @@ -5114,12 +5337,12 @@ instruct vsraa8B_imm(vecD dst, vecD src, immI shift) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (8B)" %} + format %{ "ssra $dst, $src, $shift\t# vector (8B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) sh = 7; __ ssra(as_FloatRegister($dst$$reg), __ T8B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift64_imm); %} @@ -5128,12 +5351,12 @@ instruct vsraa16B_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 16); match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (16B)" %} + format %{ "ssra $dst, $src, $shift\t# vector (16B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 8) sh = 7; __ ssra(as_FloatRegister($dst$$reg), __ T16B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift128_imm); %} @@ -5142,12 +5365,12 @@ instruct vsraa4S_imm(vecD dst, vecD src, immI shift) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (4H)" %} + format %{ "ssra $dst, $src, $shift\t# vector (4H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) sh = 15; __ ssra(as_FloatRegister($dst$$reg), __ T4H, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift64_imm); %} @@ -5156,12 +5379,12 @@ instruct vsraa8S_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (8H)" %} + format %{ "ssra $dst, $src, $shift\t# vector 
(8H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh >= 16) sh = 15; __ ssra(as_FloatRegister($dst$$reg), __ T8H, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); %} ins_pipe(vshift128_imm); %} @@ -5170,7 +5393,7 @@ instruct vsraa2I_imm(vecD dst, vecD src, immI shift) %{ predicate(n->as_Vector()->length() == 2); match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (2S)" %} + format %{ "ssra $dst, $src, $shift\t# vector (2S)" %} ins_encode %{ __ ssra(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), @@ -5183,7 +5406,7 @@ instruct vsraa4I_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (4S)" %} + format %{ "ssra $dst, $src, $shift\t# vector (4S)" %} ins_encode %{ __ ssra(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), @@ -5196,7 +5419,7 @@ instruct vsraa2L_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 2); match(Set dst (AddVL dst (RShiftVL src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "ssra $dst, $src, $shift\t# vector (2D)" %} + format %{ "ssra $dst, $src, $shift\t# vector (2D)" %} ins_encode %{ __ ssra(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), @@ -5209,12 +5432,12 @@ instruct vsrla8B_imm(vecD dst, vecD src, immI shift) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (8B)" %} + format %{ "usra $dst, $src, $shift\t# vector (8B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh < 8) { __ usra(as_FloatRegister($dst$$reg), __ T8B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift64_imm); @@ -5224,12 +5447,12 @@ instruct 
vsrla16B_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 16); match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (16B)" %} + format %{ "usra $dst, $src, $shift\t# vector (16B)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh < 8) { __ usra(as_FloatRegister($dst$$reg), __ T16B, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift128_imm); @@ -5239,12 +5462,12 @@ instruct vsrla4S_imm(vecD dst, vecD src, immI shift) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (4H)" %} + format %{ "usra $dst, $src, $shift\t# vector (4H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh < 16) { __ usra(as_FloatRegister($dst$$reg), __ T4H, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift64_imm); @@ -5254,12 +5477,12 @@ instruct vsrla8S_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (8H)" %} + format %{ "usra $dst, $src, $shift\t# vector (8H)" %} ins_encode %{ int sh = (int)$shift$$constant; if (sh < 16) { __ usra(as_FloatRegister($dst$$reg), __ T8H, - as_FloatRegister($src$$reg), sh); + as_FloatRegister($src$$reg), sh); } %} ins_pipe(vshift128_imm); @@ -5269,7 +5492,7 @@ instruct vsrla2I_imm(vecD dst, vecD src, immI shift) %{ predicate(n->as_Vector()->length() == 2); match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (2S)" %} + format %{ "usra $dst, $src, $shift\t# vector (2S)" %} ins_encode %{ __ usra(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), @@ -5282,7 +5505,7 @@ instruct vsrla4I_imm(vecX 
dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (4S)" %} + format %{ "usra $dst, $src, $shift\t# vector (4S)" %} ins_encode %{ __ usra(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), @@ -5295,7 +5518,7 @@ instruct vsrla2L_imm(vecX dst, vecX src, immI shift) %{ predicate(n->as_Vector()->length() == 2); match(Set dst (AddVL dst (URShiftVL src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "usra $dst, $src, $shift\t# vector (2D)" %} + format %{ "usra $dst, $src, $shift\t# vector (2D)" %} ins_encode %{ __ usra(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), diff --git a/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 index ff94bb002fafc..f98ddf4ee3655 100644 --- a/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 @@ -1,5 +1,5 @@ -// Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. -// Copyright (c) 2020, 2021, Arm Limited. All rights reserved. +// Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2022, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// // This code is free software; you can redistribute it and/or modify it @@ -1972,223 +1972,277 @@ VLOGICAL(xor, eor, xor, Xor, 16, B, X) // ------------------------------ Shift --------------------------------------- dnl -define(`VSHIFTCNT', ` -instruct vshiftcnt$3$4`'(vec$5 dst, iRegIorL2I cnt) %{ - predicate(UseSVE == 0 && (ifelse($3, 8, n->as_Vector()->length_in_bytes() == 4 ||` - ')n->as_Vector()->length_in_bytes() == $3)); +define(`VSLCNT', ` +instruct vslcnt$1$2`'(vec$3 dst, iRegIorL2I cnt) %{ + predicate(UseSVE == 0 && ifelse($1, 8, + (n->as_Vector()->length_in_bytes() == 4 ||` + 'n->as_Vector()->length_in_bytes() == $1), + n->as_Vector()->length_in_bytes() == $1)); match(Set dst (LShiftCntV cnt)); - match(Set dst (RShiftCntV cnt)); - format %{ "$1 $dst, $cnt\t# shift count vector ($3$4)" %} + ins_cost(INSN_COST); + format %{ "dup $dst, $cnt\t# shift count vector ($1$2)" %} ins_encode %{ - __ $2(as_FloatRegister($dst$$reg), __ T$3$4, as_Register($cnt$$reg)); + __ dup(as_FloatRegister($dst$$reg), __ T$1$2, as_Register($cnt$$reg)); %} - ins_pipe(vdup_reg_reg`'ifelse($5, D, 64, 128)); + ins_pipe(vdup_reg_reg`'ifelse($3, D, 64, 128)); %}')dnl -dnl $1 $2 $3 $4 $5 -VSHIFTCNT(dup, dup, 8, B, D) -VSHIFTCNT(dup, dup, 16, B, X) +dnl +define(`VSRCNT', ` +instruct vsrcnt$1$2`'(vec$3 dst, iRegIorL2I cnt) %{ + predicate(UseSVE == 0 && ifelse($1, 8, + (n->as_Vector()->length_in_bytes() == 4 ||` + 'n->as_Vector()->length_in_bytes() == $1), + n->as_Vector()->length_in_bytes() == $1)); + match(Set dst (RShiftCntV cnt)); + ins_cost(INSN_COST * 2); + format %{ "negw rscratch1, $cnt\t" + "dup $dst, rscratch1\t# shift count vector ($1$2)" %} + ins_encode %{ + __ negw(rscratch1, as_Register($cnt$$reg)); + __ dup(as_FloatRegister($dst$$reg), __ T$1$2, rscratch1); + %} + ins_pipe(vdup_reg_reg`'ifelse($3, D, 64, 128)); +%}')dnl +dnl + +// Vector shift count +// Note-1: Low 8 bits of each element are used, so it doesn't matter if we +// treat it as ints or bytes here. 
+// Note-2: Shift value is negated for RShiftCntV additionally. See the comments +// on vsra8B rule for more details. +dnl $1 $2 $3 +VSLCNT(8, B, D) +VSLCNT(16, B, X) +VSRCNT(8, B, D) +VSRCNT(16, B, X) +dnl +define(`PREDICATE', +`ifelse($1, 8B, + ifelse($3, `', `predicate(n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8);', + `predicate((n->as_Vector()->length() == 4 || n->as_Vector()->length() == 8) &&` + '$3);'), + $1, 4S, + ifelse($3, `', `predicate(n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4);', + `predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4) &&` + '$3);'), + ifelse($3, `', `predicate(n->as_Vector()->length() == $2);', + `predicate(n->as_Vector()->length() == $2 && $3);'))')dnl dnl define(`VSLL', ` -instruct vsll$3$4`'(vec$6 dst, vec$6 src, vec$6 shift) %{ - predicate(ifelse($3$4, 8B, n->as_Vector()->length() == 4 ||` - ', - $3$4, 4S, n->as_Vector()->length() == 2 ||` - ')n->as_Vector()->length() == $3); - match(Set dst (LShiftV$4 src shift)); +instruct vsll$1$2`'(vec$4 dst, vec$4 src, vec$4 shift) %{ + PREDICATE(`$1$2', $1, ) + match(Set dst (LShiftV$2 src shift)); ins_cost(INSN_COST); - format %{ "$1 $dst,$src,$shift\t# vector ($3$5)" %} + format %{ "sshl $dst,$src,$shift\t# vector ($1$3)" %} ins_encode %{ - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ sshl(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), as_FloatRegister($shift$$reg)); %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)); + ins_pipe(vshift`'ifelse($4, D, 64, 128)); %}')dnl dnl define(`VSRA', ` -instruct vsra$3$4`'(vec$6 dst, vec$6 src, vec$6 shift, vec$6 tmp) %{ - predicate(ifelse($3$4, 8B, n->as_Vector()->length() == 4 ||` - ', - $3$4, 4S, n->as_Vector()->length() == 2 ||` - ')n->as_Vector()->length() == $3); - match(Set dst (RShiftV$4 src shift)); +instruct vsra$1$2`'(vec$4 dst, vec$4 src, vec$4 shift) %{ + PREDICATE(`$1$2', $1, !n->as_ShiftV()->is_var_shift()) + match(Set dst (RShiftV$2 src shift)); 
ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "$1 $tmp,$shift\t" - "$2 $dst,$src,$tmp\t# vector ($3$5)" %} + format %{ "sshl $dst,$src,$shift\t# vector ($1$3)" %} ins_encode %{ - __ $1(as_FloatRegister($tmp$$reg), __ T`'ifelse($6, D, 8B, 16B), + __ sshl(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), as_FloatRegister($shift$$reg)); - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + %} + ins_pipe(vshift`'ifelse($4, D, 64, 128)); +%}')dnl +dnl +define(`VSRA_VAR', ` +instruct vsra$1$2_var`'(vec$4 dst, vec$4 src, vec$4 shift) %{ + PREDICATE(`$1$2', $1, n->as_ShiftV()->is_var_shift()) + match(Set dst (RShiftV$2 src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "sshl $dst,$src,$dst\t# vector ($1$3)" %} + ins_encode %{ + __ negr(as_FloatRegister($dst$$reg), __ T`'ifelse($4, D, 8B, 16B), + as_FloatRegister($shift$$reg)); + __ sshl(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)); + ins_pipe(vshift`'ifelse($4, D, 64, 128)); %}')dnl dnl define(`VSRL', ` -instruct vsrl$3$4`'(vec$6 dst, vec$6 src, vec$6 shift, vec$6 tmp) %{ - predicate(ifelse($3$4, 8B, n->as_Vector()->length() == 4 ||` - ', - $3$4, 4S, n->as_Vector()->length() == 2 ||` - ')n->as_Vector()->length() == $3); - match(Set dst (URShiftV$4 src shift)); +instruct vsrl$1$2`'(vec$4 dst, vec$4 src, vec$4 shift) %{ + PREDICATE(`$1$2', $1, !n->as_ShiftV()->is_var_shift()) + match(Set dst (URShiftV$2 src shift)); ins_cost(INSN_COST); - effect(TEMP tmp); - format %{ "$1 $tmp,$shift\t" - "$2 $dst,$src,$tmp\t# vector ($3$5)" %} + format %{ "ushl $dst,$src,$shift\t# vector ($1$3)" %} + ins_encode %{ + __ ushl(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), + as_FloatRegister($shift$$reg)); + %} + ins_pipe(vshift`'ifelse($4, D, 64, 128)); +%}')dnl +dnl +define(`VSRL_VAR', ` +instruct vsrl$1$2_var`'(vec$4 
dst, vec$4 src, vec$4 shift) %{ + PREDICATE(`$1$2', $1, n->as_ShiftV()->is_var_shift()) + match(Set dst (URShiftV$2 src shift)); + ins_cost(INSN_COST * 2); + effect(TEMP_DEF dst); + format %{ "negr $dst,$shift\t" + "ushl $dst,$src,$dst\t# vector ($1$3)" %} ins_encode %{ - __ $1(as_FloatRegister($tmp$$reg), __ T`'ifelse($6, D, 8B, 16B), + __ negr(as_FloatRegister($dst$$reg), __ T`'ifelse($4, D, 8B, 16B), as_FloatRegister($shift$$reg)); - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ ushl(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), - as_FloatRegister($tmp$$reg)); + as_FloatRegister($dst$$reg)); %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)); + ins_pipe(vshift`'ifelse($4, D, 64, 128)); %}')dnl dnl define(`VSLL_IMM', ` -instruct vsll$3$4_imm`'(vec$6 dst, vec$6 src, immI shift) %{ - predicate(ifelse($3$4, 8B, n->as_Vector()->length() == 4 ||` - ', - $3$4, 4S, n->as_Vector()->length() == 2 ||` - ')n->as_Vector()->length() == $3); - match(Set dst (LShiftV$4 src (LShiftCntV shift))); - ins_cost(INSN_COST); - format %{ "$1 $dst, $src, $shift\t# vector ($3$5)" %} - ins_encode %{ifelse($4, B,` +instruct vsll$1$2_imm`'(vec$4 dst, vec$4 src, immI shift) %{ + PREDICATE(`$1$2', $1, assert_not_var_shift(n)) + match(Set dst (LShiftV$2 src (LShiftCntV shift))); + ins_cost(INSN_COST); + format %{ "shl $dst, $src, $shift\t# vector ($1$3)" %} + ins_encode %{ifelse($2, B,` int sh = (int)$shift$$constant; if (sh >= 8) { - __ eor(as_FloatRegister($dst$$reg), __ ifelse($6, D, T8B, T16B), + __ eor(as_FloatRegister($dst$$reg), __ ifelse($4, D, T8B, T16B), as_FloatRegister($src$$reg), as_FloatRegister($src$$reg)); } else { - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ shl(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), sh); - }', $4, S,` + }', $2, S,` int sh = (int)$shift$$constant; if (sh >= 16) { - __ eor(as_FloatRegister($dst$$reg), __ ifelse($6, D, T8B, T16B), + __ eor(as_FloatRegister($dst$$reg), __ ifelse($4, D, T8B, T16B), 
as_FloatRegister($src$$reg), as_FloatRegister($src$$reg)); } else { - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ shl(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), sh); }', ` - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ shl(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), (int)$shift$$constant);') %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)_imm); + ins_pipe(vshift`'ifelse($4, D, 64, 128)_imm); %}')dnl +dnl define(`VSRA_IMM', ` -instruct vsra$3$4_imm`'(vec$6 dst, vec$6 src, immI shift) %{ - predicate(ifelse($3$4, 8B, n->as_Vector()->length() == 4 ||` - ', - $3$4, 4S, n->as_Vector()->length() == 2 ||` - ')n->as_Vector()->length() == $3); - match(Set dst (RShiftV$4 src (RShiftCntV shift))); - ins_cost(INSN_COST); - format %{ "$1 $dst, $src, $shift\t# vector ($3$5)" %} - ins_encode %{ifelse($4, B,` +instruct vsra$1$2_imm`'(vec$4 dst, vec$4 src, immI shift) %{ + PREDICATE(`$1$2', $1, assert_not_var_shift(n)) + match(Set dst (RShiftV$2 src (RShiftCntV shift))); + ins_cost(INSN_COST); + format %{ "sshr $dst, $src, $shift\t# vector ($1$3)" %} + ins_encode %{ifelse($2, B,` int sh = (int)$shift$$constant; if (sh >= 8) sh = 7; - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh);', $4, S,` + __ sshr(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh);', $2, S,` int sh = (int)$shift$$constant; if (sh >= 16) sh = 15; - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh);', ` - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ sshr(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh);', ` + __ sshr(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), (int)$shift$$constant);') %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)_imm); + ins_pipe(vshift`'ifelse($4, D, 64, 128)_imm); %}')dnl dnl define(`VSRL_IMM', ` -instruct vsrl$3$4_imm`'(vec$6 dst, vec$6 src, immI shift) %{ - predicate(ifelse($3$4, 8B, 
n->as_Vector()->length() == 4 ||` - ', - $3$4, 4S, n->as_Vector()->length() == 2 ||` - ')n->as_Vector()->length() == $3); - match(Set dst (URShiftV$4 src (RShiftCntV shift))); - ins_cost(INSN_COST); - format %{ "$1 $dst, $src, $shift\t# vector ($3$5)" %} - ins_encode %{ifelse($4, B,` +instruct vsrl$1$2_imm`'(vec$4 dst, vec$4 src, immI shift) %{ + PREDICATE(`$1$2', $1, assert_not_var_shift(n)) + match(Set dst (URShiftV$2 src (RShiftCntV shift))); + ins_cost(INSN_COST); + format %{ "ushr $dst, $src, $shift\t# vector ($1$3)" %} + ins_encode %{ifelse($2, B,` int sh = (int)$shift$$constant; if (sh >= 8) { - __ eor(as_FloatRegister($dst$$reg), __ ifelse($6, D, T8B, T16B), + __ eor(as_FloatRegister($dst$$reg), __ ifelse($4, D, T8B, T16B), as_FloatRegister($src$$reg), as_FloatRegister($src$$reg)); } else { - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh); - }', $4, S,` + __ ushr(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh); + }', $2, S,` int sh = (int)$shift$$constant; if (sh >= 16) { - __ eor(as_FloatRegister($dst$$reg), __ ifelse($6, D, T8B, T16B), + __ eor(as_FloatRegister($dst$$reg), __ ifelse($4, D, T8B, T16B), as_FloatRegister($src$$reg), as_FloatRegister($src$$reg)); } else { - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh); + __ ushr(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh); }', ` - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ ushr(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), (int)$shift$$constant);') %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)_imm); + ins_pipe(vshift`'ifelse($4, D, 64, 128)_imm); %}')dnl dnl define(`VSRLA_IMM', ` -instruct vsrla$3$4_imm`'(vec$6 dst, vec$6 src, immI shift) %{ - predicate(n->as_Vector()->length() == $3); - match(Set dst (AddV$4 dst (URShiftV$4 src (RShiftCntV shift)))); +instruct vsrla$1$2_imm`'(vec$4 dst, vec$4 src, immI shift) %{ + predicate(n->as_Vector()->length() == $1); + 
match(Set dst (AddV$2 dst (URShiftV$2 src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "$1 $dst, $src, $shift\t# vector ($3$5)" %} - ins_encode %{ifelse($4, B,` + format %{ "usra $dst, $src, $shift\t# vector ($1$3)" %} + ins_encode %{ifelse($2, B,` int sh = (int)$shift$$constant; if (sh < 8) { - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh); - }', $4, S,` + __ usra(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh); + }', $2, S,` int sh = (int)$shift$$constant; if (sh < 16) { - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh); + __ usra(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh); }', ` - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ usra(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), (int)$shift$$constant);') %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)_imm); + ins_pipe(vshift`'ifelse($4, D, 64, 128)_imm); %}')dnl dnl define(`VSRAA_IMM', ` -instruct vsraa$3$4_imm`'(vec$6 dst, vec$6 src, immI shift) %{ - predicate(n->as_Vector()->length() == $3); - match(Set dst (AddV$4 dst (RShiftV$4 src (RShiftCntV shift)))); +instruct vsraa$1$2_imm`'(vec$4 dst, vec$4 src, immI shift) %{ + predicate(n->as_Vector()->length() == $1); + match(Set dst (AddV$2 dst (RShiftV$2 src (RShiftCntV shift)))); ins_cost(INSN_COST); - format %{ "$1 $dst, $src, $shift\t# vector ($3$5)" %} - ins_encode %{ifelse($4, B,` + format %{ "ssra $dst, $src, $shift\t# vector ($1$3)" %} + ins_encode %{ifelse($2, B,` int sh = (int)$shift$$constant; if (sh >= 8) sh = 7; - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh);', $4, S,` + __ ssra(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh);', $2, S,` int sh = (int)$shift$$constant; if (sh >= 16) sh = 15; - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, - as_FloatRegister($src$$reg), sh);', ` - __ $2(as_FloatRegister($dst$$reg), __ T$3$5, + __ 
ssra(as_FloatRegister($dst$$reg), __ T$1$3, + as_FloatRegister($src$$reg), sh);', ` + __ ssra(as_FloatRegister($dst$$reg), __ T$1$3, as_FloatRegister($src$$reg), (int)$shift$$constant);') %} - ins_pipe(vshift`'ifelse($6, D, 64, 128)_imm); + ins_pipe(vshift`'ifelse($4, D, 64, 128)_imm); %}')dnl -dnl $1 $2 $3 $4 $5 $6 -VSLL(sshl, sshl, 8, B, B, D) -VSLL(sshl, sshl, 16, B, B, X) +dnl +undefine(PREDICATE)dnl +dnl +dnl $1 $2 $3 $4 +VSLL(8, B, B, D) +VSLL(16, B, B, X) // Right shifts with vector shift count on aarch64 SIMD are implemented // as left shift by negative shift count. @@ -2199,8 +2253,6 @@ VSLL(sshl, sshl, 16, B, B, X) // LoadVector RShiftCntV // | / // RShiftVI -// Note: In inner loop, multiple neg instructions are used, which can be -// moved to outer loop and merge into one neg instruction. // // Case 2: The vector shift count is from loading. // This case isn't supported by middle-end now. But it's supported by @@ -2210,61 +2262,83 @@ VSLL(sshl, sshl, 16, B, B, X) // | / // RShiftVI // -dnl $1 $2 $3 $4 $5 $6 -VSRA(negr, sshl, 8, B, B, D) -VSRA(negr, sshl, 16, B, B, X) -VSRL(negr, ushl, 8, B, B, D) -VSRL(negr, ushl, 16, B, B, X) -VSLL_IMM(shl, shl, 8, B, B, D) -VSLL_IMM(shl, shl, 16, B, B, X) -VSRA_IMM(sshr, sshr, 8, B, B, D) -VSRA_IMM(sshr, sshr, 16, B, B, X) -VSRL_IMM(ushr, ushr, 8, B, B, D) -VSRL_IMM(ushr, ushr, 16, B, B, X) -VSLL(sshl, sshl, 4, S, H, D) -VSLL(sshl, sshl, 8, S, H, X) -VSRA(negr, sshl, 4, S, H, D) -VSRA(negr, sshl, 8, S, H, X) -VSRL(negr, ushl, 4, S, H, D) -VSRL(negr, ushl, 8, S, H, X) -VSLL_IMM(shl, shl, 4, S, H, D) -VSLL_IMM(shl, shl, 8, S, H, X) -VSRA_IMM(sshr, sshr, 4, S, H, D) -VSRA_IMM(sshr, sshr, 8, S, H, X) -VSRL_IMM(ushr, ushr, 4, S, H, D) -VSRL_IMM(ushr, ushr, 8, S, H, X) -VSLL(sshl, sshl, 2, I, S, D) -VSLL(sshl, sshl, 4, I, S, X) -VSRA(negr, sshl, 2, I, S, D) -VSRA(negr, sshl, 4, I, S, X) -VSRL(negr, ushl, 2, I, S, D) -VSRL(negr, ushl, 4, I, S, X) -VSLL_IMM(shl, shl, 2, I, S, D) -VSLL_IMM(shl, shl, 4, I, S, X) -VSRA_IMM(sshr, 
sshr, 2, I, S, D) -VSRA_IMM(sshr, sshr, 4, I, S, X) -VSRL_IMM(ushr, ushr, 2, I, S, D) -VSRL_IMM(ushr, ushr, 4, I, S, X) -VSLL(sshl, sshl, 2, L, D, X) -VSRA(negr, sshl, 2, L, D, X) -VSRL(negr, ushl, 2, L, D, X) -VSLL_IMM(shl, shl, 2, L, D, X) -VSRA_IMM(sshr, sshr, 2, L, D, X) -VSRL_IMM(ushr, ushr, 2, L, D, X) -VSRAA_IMM(ssra, ssra, 8, B, B, D) -VSRAA_IMM(ssra, ssra, 16, B, B, X) -VSRAA_IMM(ssra, ssra, 4, S, H, D) -VSRAA_IMM(ssra, ssra, 8, S, H, X) -VSRAA_IMM(ssra, ssra, 2, I, S, D) -VSRAA_IMM(ssra, ssra, 4, I, S, X) -VSRAA_IMM(ssra, ssra, 2, L, D, X) -VSRLA_IMM(usra, usra, 8, B, B, D) -VSRLA_IMM(usra, usra, 16, B, B, X) -VSRLA_IMM(usra, usra, 4, S, H, D) -VSRLA_IMM(usra, usra, 8, S, H, X) -VSRLA_IMM(usra, usra, 2, I, S, D) -VSRLA_IMM(usra, usra, 4, I, S, X) -VSRLA_IMM(usra, usra, 2, L, D, X) +// The negate is conducted in the RShiftCntV rule for case 1, whereas it's +// done in the RShiftV* rules for case 2. The reason is that case 1 offers an +// optimization opportunity: multiple neg instructions in the inner loop can +// be hoisted to the outer loop and merged into one neg instruction. +// +// Note that ShiftVNode::is_var_shift() indicates whether the vector shift +// count is a variable vector (case 2) or not (a vector generated by RShiftCntV, +// i.e. case 1).
+dnl $1 $2 $3 $4
+VSRA(8, B, B, D)
+VSRA_VAR(8, B, B, D)
+VSRA(16, B, B, X)
+VSRA_VAR(16, B, B, X)
+VSRL(8, B, B, D)
+VSRL_VAR(8, B, B, D)
+VSRL(16, B, B, X)
+VSRL_VAR(16, B, B, X)
+VSLL_IMM(8, B, B, D)
+VSLL_IMM(16, B, B, X)
+VSRA_IMM(8, B, B, D)
+VSRA_IMM(16, B, B, X)
+VSRL_IMM(8, B, B, D)
+VSRL_IMM(16, B, B, X)
+VSLL(4, S, H, D)
+VSLL(8, S, H, X)
+VSRA(4, S, H, D)
+VSRA_VAR(4, S, H, D)
+VSRA(8, S, H, X)
+VSRA_VAR(8, S, H, X)
+VSRL(4, S, H, D)
+VSRL_VAR(4, S, H, D)
+VSRL(8, S, H, X)
+VSRL_VAR(8, S, H, X)
+VSLL_IMM(4, S, H, D)
+VSLL_IMM(8, S, H, X)
+VSRA_IMM(4, S, H, D)
+VSRA_IMM(8, S, H, X)
+VSRL_IMM(4, S, H, D)
+VSRL_IMM(8, S, H, X)
+VSLL(2, I, S, D)
+VSLL(4, I, S, X)
+VSRA(2, I, S, D)
+VSRA_VAR(2, I, S, D)
+VSRA(4, I, S, X)
+VSRA_VAR(4, I, S, X)
+VSRL(2, I, S, D)
+VSRL_VAR(2, I, S, D)
+VSRL(4, I, S, X)
+VSRL_VAR(4, I, S, X)
+VSLL_IMM(2, I, S, D)
+VSLL_IMM(4, I, S, X)
+VSRA_IMM(2, I, S, D)
+VSRA_IMM(4, I, S, X)
+VSRL_IMM(2, I, S, D)
+VSRL_IMM(4, I, S, X)
+VSLL(2, L, D, X)
+VSRA(2, L, D, X)
+VSRA_VAR(2, L, D, X)
+VSRL(2, L, D, X)
+VSRL_VAR(2, L, D, X)
+VSLL_IMM(2, L, D, X)
+VSRA_IMM(2, L, D, X)
+VSRL_IMM(2, L, D, X)
+VSRAA_IMM(8, B, B, D)
+VSRAA_IMM(16, B, B, X)
+VSRAA_IMM(4, S, H, D)
+VSRAA_IMM(8, S, H, X)
+VSRAA_IMM(2, I, S, D)
+VSRAA_IMM(4, I, S, X)
+VSRAA_IMM(2, L, D, X)
+VSRLA_IMM(8, B, B, D)
+VSRLA_IMM(16, B, B, X)
+VSRLA_IMM(4, S, H, D)
+VSRLA_IMM(8, S, H, X)
+VSRLA_IMM(2, I, S, D)
+VSRLA_IMM(4, I, S, X)
+VSRLA_IMM(2, L, D, X)
 dnl
 define(`VMINMAX', `
 instruct v$1$3`'ifelse($5, S, F, D)`'(vec$6 dst, vec$6 src1, vec$6 src2)
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorShiftRight.java b/test/micro/org/openjdk/bench/vm/compiler/VectorShiftRight.java
new file mode 100644
index 0000000000000..068d1bd704dba
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/VectorShiftRight.java
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2022, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+/**
+ * Measures element-wise right shifts ({@code >>} and {@code >>>}) over
+ * primitive arrays where every element is shifted by the same scalar,
+ * loop-invariant {@code shiftCount}.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+public class VectorShiftRight {
+    @Param({"1024"})
+    public int SIZE;
+
+    private byte[] bytesA, bytesB;
+    private short[] shortsA, shortsB;
+    private char[] charsA, charsB;
+    private int[] intsA, intsB;
+    private long[] longsA, longsB;
+
+    @Param("0")
+    private int seed;
+    // Created in init() rather than in a field initializer: JMH only promises
+    // that @Param values are injected before @Setup methods run, so seeding
+    // in an initializer could silently ignore a configured seed.
+    private Random r;
+
+    @Param("3")
+    private int shiftCount;
+
+    /** Allocates the arrays and fills the inputs from a seeded generator. */
+    @Setup
+    public void init() {
+        r = new Random(seed);
+
+        bytesA = new byte[SIZE];
+        shortsA = new short[SIZE];
+        charsA = new char[SIZE];
+        intsA = new int[SIZE];
+        longsA = new long[SIZE];
+
+        bytesB = new byte[SIZE];
+        shortsB = new short[SIZE];
+        charsB = new char[SIZE];
+        intsB = new int[SIZE];
+        longsB = new long[SIZE];
+
+        for (int i = 0; i < SIZE; i++) {
+            bytesA[i] = (byte) r.nextInt();
+            shortsA[i] = (short) r.nextInt();
+            charsA[i] = (char) r.nextInt();
+            intsA[i] = r.nextInt();
+            longsA[i] = r.nextLong();
+        }
+    }
+
+    @Benchmark
+    public void rShiftByte() {
+        for (int i = 0; i < SIZE; i++) {
+            bytesB[i] = (byte) (bytesA[i] >> shiftCount);
+        }
+    }
+
+    @Benchmark
+    public void urShiftByte() {
+        for (int i = 0; i < SIZE; i++) {
+            bytesB[i] = (byte) (bytesA[i] >>> shiftCount);
+        }
+    }
+
+    @Benchmark
+    public void rShiftShort() {
+        for (int i = 0; i < SIZE; i++) {
+            shortsB[i] = (short) (shortsA[i] >> shiftCount);
+        }
+    }
+
+    @Benchmark
+    public void urShiftChar() {
+        for (int i = 0; i < SIZE; i++) {
+            charsB[i] = (char) (charsA[i] >>> shiftCount);
+        }
+    }
+
+    @Benchmark
+    public void rShiftInt() {
+        for (int i = 0; i < SIZE; i++) {
+            intsB[i] = intsA[i] >> shiftCount;
+        }
+    }
+
+    @Benchmark
+    public void urShiftInt() {
+        for (int i = 0; i < SIZE; i++) {
+            intsB[i] = intsA[i] >>> shiftCount;
+        }
+    }
+
+    @Benchmark
+    public void rShiftLong() {
+        for (int i = 0; i < SIZE; i++) {
+            longsB[i] = longsA[i] >> shiftCount;
+        }
+    }
+
+    @Benchmark
+    public void urShiftLong() {
+        for (int i = 0; i < SIZE; i++) {
+            longsB[i] = longsA[i] >>> shiftCount;
+        }
+    }
+}