@@ -1642,7 +1642,6 @@ address StubGenerator::generate_base64_encodeBlock()
16421642 // calculate length from offsets
16431643 __ movl (length, end_offset);
16441644 __ subl (length, start_offset);
1645- __ cmpl (length, 0 );
16461645 __ jcc (Assembler::lessEqual, L_exit);
16471646
16481647 // Code for 512-bit VBMI encoding. Encodes 48 input bytes into 64
@@ -1685,8 +1684,7 @@ address StubGenerator::generate_base64_encodeBlock()
16851684 }
16861685
16871686 __ BIND (L_not512);
1688- if (VM_Version::supports_avx2 ()
1689- && VM_Version::supports_avx512vlbw ()) {
1687+ if (VM_Version::supports_avx2 ()) {
16901688 /*
16911689 ** This AVX2 encoder is based off the paper at:
16921690 ** https://dl.acm.org/doi/10.1145/3132709
@@ -1703,15 +1701,17 @@ address StubGenerator::generate_base64_encodeBlock()
17031701 __ vmovdqu (xmm9, ExternalAddress (StubRoutines::x86::base64_avx2_shuffle_addr ()), rax);
17041702 // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
17051703 __ movl (rax, 0x0fc0fc00 );
1704+ __ movdl (xmm8, rax);
17061705 __ vmovdqu (xmm1, ExternalAddress (StubRoutines::x86::base64_avx2_input_mask_addr ()), rax);
1707- __ evpbroadcastd (xmm8, rax , Assembler::AVX_256bit);
1706+ __ vpbroadcastd (xmm8, xmm8 , Assembler::AVX_256bit);
17081707
17091708 // Multiplication constant for "shifting" right by 6 and 10
17101709 // bits
17111710 __ movl (rax, 0x04000040 );
17121711
17131712 __ subl (length, 24 );
1714- __ evpbroadcastd (xmm7, rax, Assembler::AVX_256bit);
1713+ __ movdl (xmm7, rax);
1714+ __ vpbroadcastd (xmm7, xmm7, Assembler::AVX_256bit);
17151715
17161716 // For the first load, we mask off reading of the first 4
17171717 // bytes into the register. This is so we can get 4 3-byte
@@ -1813,19 +1813,23 @@ address StubGenerator::generate_base64_encodeBlock()
18131813 // Load masking register for first and third (and multiples)
18141814 // 6-bit values.
18151815 __ movl (rax, 0x003f03f0 );
1816- __ evpbroadcastd (xmm6, rax, Assembler::AVX_256bit);
1816+ __ movdl (xmm6, rax);
1817+ __ vpbroadcastd (xmm6, xmm6, Assembler::AVX_256bit);
18171818 // Multiplication constant for "shifting" left by 4 and 8 bits
18181819 __ movl (rax, 0x01000010 );
1819- __ evpbroadcastd (xmm5, rax, Assembler::AVX_256bit);
1820+ __ movdl (xmm5, rax);
1821+ __ vpbroadcastd (xmm5, xmm5, Assembler::AVX_256bit);
18201822
18211823 // Isolate 6-bit chunks of interest
18221824 __ vpand (xmm0, xmm8, xmm1, Assembler::AVX_256bit);
18231825
18241826 // Load constants for encoding
18251827 __ movl (rax, 0x19191919 );
1826- __ evpbroadcastd (xmm3, rax, Assembler::AVX_256bit);
1828+ __ movdl (xmm3, rax);
1829+ __ vpbroadcastd (xmm3, xmm3, Assembler::AVX_256bit);
18271830 __ movl (rax, 0x33333333 );
1828- __ evpbroadcastd (xmm4, rax, Assembler::AVX_256bit);
1831+ __ movdl (xmm4, rax);
1832+ __ vpbroadcastd (xmm4, xmm4, Assembler::AVX_256bit);
18291833
18301834 // Shift output bytes 0 and 2 into proper lanes
18311835 __ vpmulhuw (xmm2, xmm0, xmm7, Assembler::AVX_256bit);
@@ -2133,6 +2137,80 @@ address StubGenerator::base64_vbmi_join_2_3_addr() {
21332137 return start;
21342138}
21352139
2140+ address StubGenerator::base64_AVX2_decode_tables_addr () { // Emits the constant table read by the AVX2 path of generate_base64_decodeBlock; returns the table's start address.
2141+ __ align64 (); // start the table on a 64-byte boundary (verified by the assert below)
2142+ StubCodeMark mark (this , " StubRoutines" , " AVX2_tables_base64" );
2143+ address start = __ pc ();
2144+ // Layout, as byte offsets used by the decoder: 0x00 compare bytes, 0x08 roll-index masks, 0x10 permute, 0x30 shuffle, 0x50 merge, 0x54 merge multiplier.
2145+ assert (((unsigned long long )start & 0x3f ) == 0 , // low 6 address bits must be zero, i.e. 64-byte aligned
2146+ " Alignment problem (0x%08llx)" , (unsigned long long )start);
2147+ __ emit_data (0x2f2f2f2f , relocInfo::none, 0 ); // offset 0x00: 0x2F ('/') in every byte, non-URL special-case compare value
2148+ __ emit_data (0x5f5f5f5f , relocInfo::none, 0 ); // for URL: offset 0x04, 0x5F ('_') in every byte; the decoder picks 0x00 or 0x04 via isURL << 2
2149+
2150+ __ emit_data (0xffffffff , relocInfo::none, 0 ); // offset 0x08: -1 per byte, non-URL; ANDed in the decoder so the roll-index adjustment is a no-op
2151+ __ emit_data (0xfcfcfcfc , relocInfo::none, 0 ); // for URL: offset 0x0c, -4 per byte; adjusts the roll index for 0x5F
2152+
2153+ // Permute table (offset 0x10, 32 bytes): vpermd dword indices 0,1,2,4,5,6; the last two slots are 0xffffffff filler
2154+ __ emit_data64 (0x0000000100000000 , relocInfo::none);
2155+ __ emit_data64 (0x0000000400000002 , relocInfo::none);
2156+ __ emit_data64 (0x0000000600000005 , relocInfo::none);
2157+ __ emit_data64 (0xffffffffffffffff , relocInfo::none);
2158+
2159+ // Shuffle table (offset 0x30, 32 bytes): vpshufb byte indices; the same 16-byte pattern is emitted twice
2160+ __ emit_data64 (0x090a040506000102 , relocInfo::none);
2161+ __ emit_data64 (0xffffffff0c0d0e08 , relocInfo::none);
2162+ __ emit_data64 (0x090a040506000102 , relocInfo::none);
2163+ __ emit_data64 (0xffffffff0c0d0e08 , relocInfo::none);
2164+
2165+ // merge table (offset 0x50): broadcast by the decoder and used as the vpmaddubsw multiplier
2166+ __ emit_data (0x01400140 , relocInfo::none, 0 );
2167+
2168+ // merge multiplier (offset 0x54): broadcast by the decoder and used as the vpmaddwd multiplier
2169+ __ emit_data (0x00011000 , relocInfo::none, 0 );
2170+
2171+ return start; // address of the first emitted constant
2172+ }
2173+
2174+ address StubGenerator::base64_AVX2_decode_LUT_tables_addr () { // Emits the nibble lookup tables (standard and URL variants) read by the AVX2 path of generate_base64_decodeBlock; returns the start address.
2175+ __ align64 (); // start the tables on a 64-byte boundary (verified by the assert below)
2176+ StubCodeMark mark (this , " StubRoutines" , " AVX2_tables_URL_base64" ); // NOTE(review): the stub name says "URL", but both standard and URL tables are emitted here
2177+ address start = __ pc ();
2178+ // Layout, as byte offsets used by the decoder: 0x00 lut_lo, 0x20 lut_roll, 0x40 lut_lo URL, 0x60 lut_roll URL, 0x80 lut_hi. Each table is 32 bytes with identical 16-byte halves.
2179+ assert (((unsigned long long )start & 0x3f ) == 0 , // low 6 address bits must be zero, i.e. 64-byte aligned
2180+ " Alignment problem (0x%08llx)" , (unsigned long long )start);
2181+ // lut_lo (offset 0x00): vpshufb table indexed by the low nibble of each input byte; yields the bitset tested against lut_hi
2182+ __ emit_data64 (0x1111111111111115 , relocInfo::none);
2183+ __ emit_data64 (0x1a1b1b1b1a131111 , relocInfo::none);
2184+ __ emit_data64 (0x1111111111111115 , relocInfo::none);
2185+ __ emit_data64 (0x1a1b1b1b1a131111 , relocInfo::none);
2186+
2187+ // lut_roll (offset 0x20): per-byte value the decoder adds to the input to obtain the 6-bit output value
2188+ __ emit_data64 (0xb9b9bfbf04131000 , relocInfo::none);
2189+ __ emit_data64 (0x0000000000000000 , relocInfo::none);
2190+ __ emit_data64 (0xb9b9bfbf04131000 , relocInfo::none);
2191+ __ emit_data64 (0x0000000000000000 , relocInfo::none);
2192+
2193+ // lut_lo URL (offset 0x40): URL-alphabet variant, reached when the decoder's shifted isURL displacement is 0x40
2194+ __ emit_data64 (0x1111111111111115 , relocInfo::none);
2195+ __ emit_data64 (0x1b1b1a1b1b131111 , relocInfo::none);
2196+ __ emit_data64 (0x1111111111111115 , relocInfo::none);
2197+ __ emit_data64 (0x1b1b1a1b1b131111 , relocInfo::none);
2198+
2199+ // lut_roll URL (offset 0x60): URL-alphabet variant of lut_roll
2200+ __ emit_data64 (0xb9b9bfbf0411e000 , relocInfo::none);
2201+ __ emit_data64 (0x0000000000000000 , relocInfo::none);
2202+ __ emit_data64 (0xb9b9bfbf0411e000 , relocInfo::none);
2203+ __ emit_data64 (0x0000000000000000 , relocInfo::none);
2204+
2205+ // lut_hi (offset 0x80): vpshufb table indexed by the high nibble; loaded at a fixed offset, so shared by both variants
2206+ __ emit_data64 (0x0804080402011010 , relocInfo::none);
2207+ __ emit_data64 (0x1010101010101010 , relocInfo::none);
2208+ __ emit_data64 (0x0804080402011010 , relocInfo::none);
2209+ __ emit_data64 (0x1010101010101010 , relocInfo::none);
2210+
2211+ return start; // address of the first emitted table
2212+ }
2213+
21362214address StubGenerator::base64_decoding_table_addr () {
21372215 StubCodeMark mark (this , " StubRoutines" , " decoding_table_base64" );
21382216 address start = __ pc ();
@@ -2289,7 +2367,7 @@ address StubGenerator::generate_base64_decodeBlock() {
22892367
22902368 Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
22912369 Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
2292- Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
2370+ Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero, L_lastChunk ;
22932371
22942372 // calculate length from offsets
22952373 __ movl (length, end_offset);
@@ -2299,11 +2377,11 @@ address StubGenerator::generate_base64_decodeBlock() {
22992377 // If AVX512 VBMI not supported, just compile non-AVX code
23002378 if (VM_Version::supports_avx512_vbmi () &&
23012379 VM_Version::supports_avx512bw ()) {
2302- __ cmpl (length, 128 ); // 128 -bytes is break-even for AVX-512
2303- __ jcc (Assembler::lessEqual, L_bruteForce );
2380+ __ cmpl (length, 31 ); // 32 -bytes is break-even for AVX-512
2381+ __ jcc (Assembler::lessEqual, L_lastChunk );
23042382
23052383 __ cmpl (isMIME, 0 );
2306- __ jcc (Assembler::notEqual, L_bruteForce );
2384+ __ jcc (Assembler::notEqual, L_lastChunk );
23072385
23082386 // Load lookup tables based on isURL
23092387 __ cmpl (isURL, 0 );
@@ -2554,6 +2632,89 @@ address StubGenerator::generate_base64_decodeBlock() {
25542632 __ BIND (L_bruteForce);
25552633 } // End of if(avx512_vbmi)
25562634
2635+ if (VM_Version::supports_avx2 ()) {
2636+ Label L_tailProc, L_topLoop, L_enterLoop;
2637+
2638+ __ cmpl (isMIME, 0 );
2639+ __ jcc (Assembler::notEqual, L_lastChunk);
2640+
2641+ // Check for buffer too small (for algorithm)
2642+ __ subl (length, 0x2c );
2643+ __ jcc (Assembler::less, L_tailProc);
2644+
2645+ __ shll (isURL, 2 );
2646+
2647+ // Algorithm adapted from https://arxiv.org/abs/1704.00605, "Faster Base64
2648+ // Encoding and Decoding using AVX2 Instructions". URL modifications added.
2649+
2650+ // Set up constants
2651+ __ lea (r13, ExternalAddress (StubRoutines::x86::base64_AVX2_decode_tables_addr ()));
2652+ __ vpbroadcastd (xmm4, Address (r13, isURL, Address::times_1), Assembler::AVX_256bit); // 2F or 5F
2653+ __ vpbroadcastd (xmm10, Address (r13, isURL, Address::times_1, 0x08 ), Assembler::AVX_256bit); // -1 or -4
2654+ __ vmovdqu (xmm12, Address (r13, 0x10 )); // permute
2655+ __ vmovdqu (xmm13, Address (r13, 0x30 )); // shuffle
2656+ __ vpbroadcastd (xmm7, Address (r13, 0x50 ), Assembler::AVX_256bit); // merge
2657+ __ vpbroadcastd (xmm6, Address (r13, 0x54 ), Assembler::AVX_256bit); // merge mult
2658+
2659+ __ lea (r13, ExternalAddress (StubRoutines::x86::base64_AVX2_decode_LUT_tables_addr ()));
2660+ __ shll (isURL, 4 );
2661+ __ vmovdqu (xmm11, Address (r13, isURL, Address::times_1, 0x00 )); // lut_lo
2662+ __ vmovdqu (xmm8, Address (r13, isURL, Address::times_1, 0x20 )); // lut_roll
2663+ __ shrl (isURL, 6 ); // restore isURL
2664+ __ vmovdqu (xmm9, Address (r13, 0x80 )); // lut_hi
2665+ __ jmp (L_enterLoop);
2666+
2667+ __ align32 ();
2668+ __ bind (L_topLoop);
2669+ // Add in the offset value (roll) to get 6-bit out values
2670+ __ vpaddb (xmm0, xmm0, xmm2, Assembler::AVX_256bit);
2671+ // Merge and permute the output bits into appropriate output byte lanes
2672+ __ vpmaddubsw (xmm0, xmm0, xmm7, Assembler::AVX_256bit);
2673+ __ vpmaddwd (xmm0, xmm0, xmm6, Assembler::AVX_256bit);
2674+ __ vpshufb (xmm0, xmm0, xmm13, Assembler::AVX_256bit);
2675+ __ vpermd (xmm0, xmm12, xmm0, Assembler::AVX_256bit);
2676+ // Store the output bytes
2677+ __ vmovdqu (Address (dest, dp, Address::times_1, 0 ), xmm0);
2678+ __ addptr (source, 0x20 );
2679+ __ addptr (dest, 0x18 );
2680+ __ subl (length, 0x20 );
2681+ __ jcc (Assembler::less, L_tailProc);
2682+
2683+ __ bind (L_enterLoop);
2684+
2685+ // Load in encoded string (32 bytes)
2686+ __ vmovdqu (xmm2, Address (source, start_offset, Address::times_1, 0x0 ));
2687+ // Extract the high nibble for indexing into the lut tables. High 4 bits are don't care.
2688+ __ vpsrld (xmm1, xmm2, 0x4 , Assembler::AVX_256bit);
2689+ __ vpand (xmm1, xmm4, xmm1, Assembler::AVX_256bit);
2690+ // Extract the low nibble. 5F/2F will isolate the low-order 4 bits. High 4 bits are don't care.
2691+ __ vpand (xmm3, xmm2, xmm4, Assembler::AVX_256bit);
2692+ // Check for special-case (0x2F or 0x5F (URL))
2693+ __ vpcmpeqb (xmm0, xmm4, xmm2, Assembler::AVX_256bit);
2694+ // Get the bitset based on the low nibble. vpshufb uses low-order 4 bits only.
2695+ __ vpshufb (xmm3, xmm11, xmm3, Assembler::AVX_256bit);
2696+ // Get the bit value of the high nibble
2697+ __ vpshufb (xmm5, xmm9, xmm1, Assembler::AVX_256bit);
2698+ // Make sure 2F / 5F shows as valid
2699+ __ vpandn (xmm3, xmm0, xmm3, Assembler::AVX_256bit);
2700+ // Make adjustment for roll index. For non-URL, this is a no-op,
2701+ // for URL, this adjusts by -4. This is to properly index the
2702+ // roll value for 2F / 5F.
2703+ __ vpand (xmm0, xmm0, xmm10, Assembler::AVX_256bit);
2704+ // If the and of the two is non-zero, we have an invalid input character
2705+ __ vptest (xmm3, xmm5);
2706+ // Extract the "roll" value - value to add to the input to get 6-bit out value
2707+ __ vpaddb (xmm0, xmm0, xmm1, Assembler::AVX_256bit); // Handle 2F / 5F
2708+ __ vpshufb (xmm0, xmm8, xmm0, Assembler::AVX_256bit);
2709+ __ jcc (Assembler::equal, L_topLoop); // Fall through on error
2710+
2711+ __ bind (L_tailProc);
2712+
2713+ __ addl (length, 0x2c );
2714+
2715+ __ vzeroupper ();
2716+ }
2717+
25572718 // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
25582719
25592720 // Register state (Linux):
@@ -2584,6 +2745,8 @@ address StubGenerator::generate_base64_decodeBlock() {
25842745 const Register byte3 = WIN64_ONLY (r8) NOT_WIN64 (rdx);
25852746 const Register byte4 = WIN64_ONLY (r10) NOT_WIN64 (r9);
25862747
2748+ __ bind (L_lastChunk);
2749+
25872750 __ shrl (length, 2 ); // Multiple of 4 bytes only - length is # 4-byte chunks
25882751 __ cmpl (length, 0 );
25892752 __ jcc (Assembler::lessEqual, L_exit_no_vzero);
@@ -3829,12 +3992,12 @@ void StubGenerator::generate_all() {
38293992 }
38303993
38313994 if (UseBASE64Intrinsics) {
3832- if (VM_Version::supports_avx2 () &&
3833- VM_Version::supports_avx512bw () &&
3834- VM_Version::supports_avx512vl ()) {
3995+ if (VM_Version::supports_avx2 ()) {
38353996 StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr ();
38363997 StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr ();
38373998 StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr ();
3999+ StubRoutines::x86::_avx2_decode_tables_base64 = base64_AVX2_decode_tables_addr ();
4000+ StubRoutines::x86::_avx2_decode_lut_tables_base64 = base64_AVX2_decode_LUT_tables_addr ();
38384001 }
38394002 StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr ();
38404003 if (VM_Version::supports_avx512_vbmi ()) {
0 commit comments