Skip to content

Commit 33bec20

Browse files
Scott Gibbons authored and cl4es committed
8300808: Accelerate Base64 on x86 for AVX2
Reviewed-by: jbhateja, redestad, sviswanathan
1 parent 46bcc49 commit 33bec20

File tree

7 files changed

+229
-26
lines changed

7 files changed

+229
-26
lines changed

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 179 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,7 +1642,6 @@ address StubGenerator::generate_base64_encodeBlock()
16421642
// calculate length from offsets
16431643
__ movl(length, end_offset);
16441644
__ subl(length, start_offset);
1645-
__ cmpl(length, 0);
16461645
__ jcc(Assembler::lessEqual, L_exit);
16471646

16481647
// Code for 512-bit VBMI encoding. Encodes 48 input bytes into 64
@@ -1685,8 +1684,7 @@ address StubGenerator::generate_base64_encodeBlock()
16851684
}
16861685

16871686
__ BIND(L_not512);
1688-
if (VM_Version::supports_avx2()
1689-
&& VM_Version::supports_avx512vlbw()) {
1687+
if (VM_Version::supports_avx2()) {
16901688
/*
16911689
** This AVX2 encoder is based off the paper at:
16921690
** https://dl.acm.org/doi/10.1145/3132709
@@ -1703,15 +1701,17 @@ address StubGenerator::generate_base64_encodeBlock()
17031701
__ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
17041702
// 6-bit mask for 2nd and 4th (and multiples) 6-bit values
17051703
__ movl(rax, 0x0fc0fc00);
1704+
__ movdl(xmm8, rax);
17061705
__ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
1707-
__ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
1706+
__ vpbroadcastd(xmm8, xmm8, Assembler::AVX_256bit);
17081707

17091708
// Multiplication constant for "shifting" right by 6 and 10
17101709
// bits
17111710
__ movl(rax, 0x04000040);
17121711

17131712
__ subl(length, 24);
1714-
__ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
1713+
__ movdl(xmm7, rax);
1714+
__ vpbroadcastd(xmm7, xmm7, Assembler::AVX_256bit);
17151715

17161716
// For the first load, we mask off reading of the first 4
17171717
// bytes into the register. This is so we can get 4 3-byte
@@ -1813,19 +1813,23 @@ address StubGenerator::generate_base64_encodeBlock()
18131813
// Load masking register for first and third (and multiples)
18141814
// 6-bit values.
18151815
__ movl(rax, 0x003f03f0);
1816-
__ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
1816+
__ movdl(xmm6, rax);
1817+
__ vpbroadcastd(xmm6, xmm6, Assembler::AVX_256bit);
18171818
// Multiplication constant for "shifting" left by 4 and 8 bits
18181819
__ movl(rax, 0x01000010);
1819-
__ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
1820+
__ movdl(xmm5, rax);
1821+
__ vpbroadcastd(xmm5, xmm5, Assembler::AVX_256bit);
18201822

18211823
// Isolate 6-bit chunks of interest
18221824
__ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
18231825

18241826
// Load constants for encoding
18251827
__ movl(rax, 0x19191919);
1826-
__ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
1828+
__ movdl(xmm3, rax);
1829+
__ vpbroadcastd(xmm3, xmm3, Assembler::AVX_256bit);
18271830
__ movl(rax, 0x33333333);
1828-
__ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
1831+
__ movdl(xmm4, rax);
1832+
__ vpbroadcastd(xmm4, xmm4, Assembler::AVX_256bit);
18291833

18301834
// Shift output bytes 0 and 2 into proper lanes
18311835
__ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
@@ -2133,6 +2137,80 @@ address StubGenerator::base64_vbmi_join_2_3_addr() {
21332137
return start;
21342138
}
21352139

2140+
address StubGenerator::base64_AVX2_decode_tables_addr() {
2141+
__ align64();
2142+
StubCodeMark mark(this, "StubRoutines", "AVX2_tables_base64");
2143+
address start = __ pc();
2144+
2145+
assert(((unsigned long long)start & 0x3f) == 0,
2146+
"Alignment problem (0x%08llx)", (unsigned long long)start);
2147+
__ emit_data(0x2f2f2f2f, relocInfo::none, 0);
2148+
__ emit_data(0x5f5f5f5f, relocInfo::none, 0); // for URL
2149+
2150+
__ emit_data(0xffffffff, relocInfo::none, 0);
2151+
__ emit_data(0xfcfcfcfc, relocInfo::none, 0); // for URL
2152+
2153+
// Permute table
2154+
__ emit_data64(0x0000000100000000, relocInfo::none);
2155+
__ emit_data64(0x0000000400000002, relocInfo::none);
2156+
__ emit_data64(0x0000000600000005, relocInfo::none);
2157+
__ emit_data64(0xffffffffffffffff, relocInfo::none);
2158+
2159+
// Shuffle table
2160+
__ emit_data64(0x090a040506000102, relocInfo::none);
2161+
__ emit_data64(0xffffffff0c0d0e08, relocInfo::none);
2162+
__ emit_data64(0x090a040506000102, relocInfo::none);
2163+
__ emit_data64(0xffffffff0c0d0e08, relocInfo::none);
2164+
2165+
// merge table
2166+
__ emit_data(0x01400140, relocInfo::none, 0);
2167+
2168+
// merge multiplier
2169+
__ emit_data(0x00011000, relocInfo::none, 0);
2170+
2171+
return start;
2172+
}
2173+
2174+
address StubGenerator::base64_AVX2_decode_LUT_tables_addr() {
2175+
__ align64();
2176+
StubCodeMark mark(this, "StubRoutines", "AVX2_tables_URL_base64");
2177+
address start = __ pc();
2178+
2179+
assert(((unsigned long long)start & 0x3f) == 0,
2180+
"Alignment problem (0x%08llx)", (unsigned long long)start);
2181+
// lut_lo
2182+
__ emit_data64(0x1111111111111115, relocInfo::none);
2183+
__ emit_data64(0x1a1b1b1b1a131111, relocInfo::none);
2184+
__ emit_data64(0x1111111111111115, relocInfo::none);
2185+
__ emit_data64(0x1a1b1b1b1a131111, relocInfo::none);
2186+
2187+
// lut_roll
2188+
__ emit_data64(0xb9b9bfbf04131000, relocInfo::none);
2189+
__ emit_data64(0x0000000000000000, relocInfo::none);
2190+
__ emit_data64(0xb9b9bfbf04131000, relocInfo::none);
2191+
__ emit_data64(0x0000000000000000, relocInfo::none);
2192+
2193+
// lut_lo URL
2194+
__ emit_data64(0x1111111111111115, relocInfo::none);
2195+
__ emit_data64(0x1b1b1a1b1b131111, relocInfo::none);
2196+
__ emit_data64(0x1111111111111115, relocInfo::none);
2197+
__ emit_data64(0x1b1b1a1b1b131111, relocInfo::none);
2198+
2199+
// lut_roll URL
2200+
__ emit_data64(0xb9b9bfbf0411e000, relocInfo::none);
2201+
__ emit_data64(0x0000000000000000, relocInfo::none);
2202+
__ emit_data64(0xb9b9bfbf0411e000, relocInfo::none);
2203+
__ emit_data64(0x0000000000000000, relocInfo::none);
2204+
2205+
// lut_hi
2206+
__ emit_data64(0x0804080402011010, relocInfo::none);
2207+
__ emit_data64(0x1010101010101010, relocInfo::none);
2208+
__ emit_data64(0x0804080402011010, relocInfo::none);
2209+
__ emit_data64(0x1010101010101010, relocInfo::none);
2210+
2211+
return start;
2212+
}
2213+
21362214
address StubGenerator::base64_decoding_table_addr() {
21372215
StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
21382216
address start = __ pc();
@@ -2289,7 +2367,7 @@ address StubGenerator::generate_base64_decodeBlock() {
22892367

22902368
Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
22912369
Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
2292-
Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
2370+
Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero, L_lastChunk;
22932371

22942372
// calculate length from offsets
22952373
__ movl(length, end_offset);
@@ -2299,11 +2377,11 @@ address StubGenerator::generate_base64_decodeBlock() {
22992377
// If AVX512 VBMI not supported, just compile non-AVX code
23002378
if(VM_Version::supports_avx512_vbmi() &&
23012379
VM_Version::supports_avx512bw()) {
2302-
__ cmpl(length, 128); // 128-bytes is break-even for AVX-512
2303-
__ jcc(Assembler::lessEqual, L_bruteForce);
2380+
__ cmpl(length, 31); // 32-bytes is break-even for AVX-512
2381+
__ jcc(Assembler::lessEqual, L_lastChunk);
23042382

23052383
__ cmpl(isMIME, 0);
2306-
__ jcc(Assembler::notEqual, L_bruteForce);
2384+
__ jcc(Assembler::notEqual, L_lastChunk);
23072385

23082386
// Load lookup tables based on isURL
23092387
__ cmpl(isURL, 0);
@@ -2554,6 +2632,89 @@ address StubGenerator::generate_base64_decodeBlock() {
25542632
__ BIND(L_bruteForce);
25552633
} // End of if(avx512_vbmi)
25562634

2635+
if (VM_Version::supports_avx2()) {
2636+
Label L_tailProc, L_topLoop, L_enterLoop;
2637+
2638+
__ cmpl(isMIME, 0);
2639+
__ jcc(Assembler::notEqual, L_lastChunk);
2640+
2641+
// Check for buffer too small (for algorithm)
2642+
__ subl(length, 0x2c);
2643+
__ jcc(Assembler::less, L_tailProc);
2644+
2645+
__ shll(isURL, 2);
2646+
2647+
// Algorithm adapted from https://arxiv.org/abs/1704.00605, "Faster Base64
2648+
// Encoding and Decoding using AVX2 Instructions". URL modifications added.
2649+
2650+
// Set up constants
2651+
__ lea(r13, ExternalAddress(StubRoutines::x86::base64_AVX2_decode_tables_addr()));
2652+
__ vpbroadcastd(xmm4, Address(r13, isURL, Address::times_1), Assembler::AVX_256bit); // 2F or 5F
2653+
__ vpbroadcastd(xmm10, Address(r13, isURL, Address::times_1, 0x08), Assembler::AVX_256bit); // -1 or -4
2654+
__ vmovdqu(xmm12, Address(r13, 0x10)); // permute
2655+
__ vmovdqu(xmm13, Address(r13, 0x30)); // shuffle
2656+
__ vpbroadcastd(xmm7, Address(r13, 0x50), Assembler::AVX_256bit); // merge
2657+
__ vpbroadcastd(xmm6, Address(r13, 0x54), Assembler::AVX_256bit); // merge mult
2658+
2659+
__ lea(r13, ExternalAddress(StubRoutines::x86::base64_AVX2_decode_LUT_tables_addr()));
2660+
__ shll(isURL, 4);
2661+
__ vmovdqu(xmm11, Address(r13, isURL, Address::times_1, 0x00)); // lut_lo
2662+
__ vmovdqu(xmm8, Address(r13, isURL, Address::times_1, 0x20)); // lut_roll
2663+
__ shrl(isURL, 6); // restore isURL
2664+
__ vmovdqu(xmm9, Address(r13, 0x80)); // lut_hi
2665+
__ jmp(L_enterLoop);
2666+
2667+
__ align32();
2668+
__ bind(L_topLoop);
2669+
// Add in the offset value (roll) to get 6-bit out values
2670+
__ vpaddb(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
2671+
// Merge and permute the output bits into appropriate output byte lanes
2672+
__ vpmaddubsw(xmm0, xmm0, xmm7, Assembler::AVX_256bit);
2673+
__ vpmaddwd(xmm0, xmm0, xmm6, Assembler::AVX_256bit);
2674+
__ vpshufb(xmm0, xmm0, xmm13, Assembler::AVX_256bit);
2675+
__ vpermd(xmm0, xmm12, xmm0, Assembler::AVX_256bit);
2676+
// Store the output bytes
2677+
__ vmovdqu(Address(dest, dp, Address::times_1, 0), xmm0);
2678+
__ addptr(source, 0x20);
2679+
__ addptr(dest, 0x18);
2680+
__ subl(length, 0x20);
2681+
__ jcc(Assembler::less, L_tailProc);
2682+
2683+
__ bind(L_enterLoop);
2684+
2685+
// Load in encoded string (32 bytes)
2686+
__ vmovdqu(xmm2, Address(source, start_offset, Address::times_1, 0x0));
2687+
// Extract the high nibble for indexing into the lut tables. High 4 bits are don't care.
2688+
__ vpsrld(xmm1, xmm2, 0x4, Assembler::AVX_256bit);
2689+
__ vpand(xmm1, xmm4, xmm1, Assembler::AVX_256bit);
2690+
// Extract the low nibble. 5F/2F will isolate the low-order 4 bits. High 4 bits are don't care.
2691+
__ vpand(xmm3, xmm2, xmm4, Assembler::AVX_256bit);
2692+
// Check for special-case (0x2F or 0x5F (URL))
2693+
__ vpcmpeqb(xmm0, xmm4, xmm2, Assembler::AVX_256bit);
2694+
// Get the bitset based on the low nibble. vpshufb uses low-order 4 bits only.
2695+
__ vpshufb(xmm3, xmm11, xmm3, Assembler::AVX_256bit);
2696+
// Get the bit value of the high nibble
2697+
__ vpshufb(xmm5, xmm9, xmm1, Assembler::AVX_256bit);
2698+
// Make sure 2F / 5F shows as valid
2699+
__ vpandn(xmm3, xmm0, xmm3, Assembler::AVX_256bit);
2700+
// Make adjustment for roll index. For non-URL, this is a no-op,
2701+
// for URL, this adjusts by -4. This is to properly index the
2702+
// roll value for 2F / 5F.
2703+
__ vpand(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
2704+
// If the and of the two is non-zero, we have an invalid input character
2705+
__ vptest(xmm3, xmm5);
2706+
// Extract the "roll" value - value to add to the input to get 6-bit out value
2707+
__ vpaddb(xmm0, xmm0, xmm1, Assembler::AVX_256bit); // Handle 2F / 5F
2708+
__ vpshufb(xmm0, xmm8, xmm0, Assembler::AVX_256bit);
2709+
__ jcc(Assembler::equal, L_topLoop); // Fall through on error
2710+
2711+
__ bind(L_tailProc);
2712+
2713+
__ addl(length, 0x2c);
2714+
2715+
__ vzeroupper();
2716+
}
2717+
25572718
// Use non-AVX code to decode 4-byte chunks into 3 bytes of output
25582719

25592720
// Register state (Linux):
@@ -2584,6 +2745,8 @@ address StubGenerator::generate_base64_decodeBlock() {
25842745
const Register byte3 = WIN64_ONLY(r8) NOT_WIN64(rdx);
25852746
const Register byte4 = WIN64_ONLY(r10) NOT_WIN64(r9);
25862747

2748+
__ bind(L_lastChunk);
2749+
25872750
__ shrl(length, 2); // Multiple of 4 bytes only - length is # 4-byte chunks
25882751
__ cmpl(length, 0);
25892752
__ jcc(Assembler::lessEqual, L_exit_no_vzero);
@@ -3829,12 +3992,12 @@ void StubGenerator::generate_all() {
38293992
}
38303993

38313994
if (UseBASE64Intrinsics) {
3832-
if(VM_Version::supports_avx2() &&
3833-
VM_Version::supports_avx512bw() &&
3834-
VM_Version::supports_avx512vl()) {
3995+
if(VM_Version::supports_avx2()) {
38353996
StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
38363997
StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
38373998
StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
3999+
StubRoutines::x86::_avx2_decode_tables_base64 = base64_AVX2_decode_tables_addr();
4000+
StubRoutines::x86::_avx2_decode_lut_tables_base64 = base64_AVX2_decode_LUT_tables_addr();
38384001
}
38394002
StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
38404003
if (VM_Version::supports_avx512_vbmi()) {

src/hotspot/cpu/x86/stubGenerator_x86_64.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,8 @@ class StubGenerator: public StubCodeGenerator {
441441
address base64_vbmi_join_1_2_addr();
442442
address base64_vbmi_join_2_3_addr();
443443
address base64_decoding_table_addr();
444+
address base64_AVX2_decode_tables_addr();
445+
address base64_AVX2_decode_LUT_tables_addr();
444446

445447
// Code for generating Base64 decoding.
446448
//

src/hotspot/cpu/x86/stubRoutines_x86.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ address StubRoutines::x86::_shuffle_base64 = NULL;
7171
address StubRoutines::x86::_avx2_shuffle_base64 = NULL;
7272
address StubRoutines::x86::_avx2_input_mask_base64 = NULL;
7373
address StubRoutines::x86::_avx2_lut_base64 = NULL;
74+
address StubRoutines::x86::_avx2_decode_tables_base64 = NULL;
75+
address StubRoutines::x86::_avx2_decode_lut_tables_base64 = NULL;
7476
address StubRoutines::x86::_lookup_lo_base64 = NULL;
7577
address StubRoutines::x86::_lookup_hi_base64 = NULL;
7678
address StubRoutines::x86::_lookup_lo_base64url = NULL;

src/hotspot/cpu/x86/stubRoutines_x86.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ class x86 {
185185
static address _avx2_shuffle_base64;
186186
static address _avx2_input_mask_base64;
187187
static address _avx2_lut_base64;
188+
static address _avx2_decode_tables_base64;
189+
static address _avx2_decode_lut_tables_base64;
188190
static address _lookup_lo_base64;
189191
static address _lookup_hi_base64;
190192
static address _lookup_lo_base64url;
@@ -325,6 +327,8 @@ class x86 {
325327
static address base64_vbmi_join_1_2_addr() { return _join_1_2_base64; }
326328
static address base64_vbmi_join_2_3_addr() { return _join_2_3_base64; }
327329
static address base64_decoding_table_addr() { return _decoding_table_base64; }
330+
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
331+
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
328332
#endif
329333
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
330334
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }

src/hotspot/cpu/x86/vm_version_x86.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1140,7 +1140,7 @@ void VM_Version::get_processor_features() {
11401140
}
11411141

11421142
// Base64 Intrinsics (Check the condition for which the intrinsic will be active)
1143-
if ((UseAVX > 2) && supports_avx512vl() && supports_avx512bw()) {
1143+
if (UseAVX >= 2) {
11441144
if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
11451145
UseBASE64Intrinsics = true;
11461146
}

0 commit comments

Comments (0)