Skip to content

Commit cafd84e

Browse files
committed
[GR-49260] Remove usages of fcmpZero in AArch64 intrinsics.
PullRequest: graal/16316
2 parents 8eb16fa + e56fb11 commit cafd84e

11 files changed

+228
-171
lines changed

compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/asm/aarch64/AArch64ASIMDAssembler.java

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@
2424
*/
2525
package jdk.graal.compiler.asm.aarch64;
2626

27-
import static jdk.vm.ci.aarch64.AArch64.CPU;
28-
import static jdk.vm.ci.aarch64.AArch64.SIMD;
29-
import static jdk.vm.ci.aarch64.AArch64.zr;
3027
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.LoadFlag;
3128
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.rd;
3229
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.rn;
3330
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.rs1;
3431
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.rs2;
3532
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.rs3;
33+
import static jdk.vm.ci.aarch64.AArch64.CPU;
34+
import static jdk.vm.ci.aarch64.AArch64.SIMD;
35+
import static jdk.vm.ci.aarch64.AArch64.zr;
3636

3737
import java.util.Arrays;
3838
import java.util.HashMap;
@@ -41,7 +41,6 @@
4141
import jdk.graal.compiler.core.common.NumUtil;
4242
import jdk.graal.compiler.core.common.Stride;
4343
import jdk.graal.compiler.debug.GraalError;
44-
4544
import jdk.vm.ci.aarch64.AArch64;
4645
import jdk.vm.ci.aarch64.AArch64Kind;
4746
import jdk.vm.ci.code.Register;
@@ -704,7 +703,9 @@ public enum ASIMDInstruction {
704703
CMHS(UBit | 0b00111 << 11),
705704
USHL(UBit | 0b01000 << 11),
706705
UMAX(UBit | 0b01100 << 11),
706+
UMAXP(UBit | 0b10100 << 11),
707707
UMIN(UBit | 0b01101 << 11),
708+
UMINP(UBit | 0b10101 << 11),
708709
SUB(UBit | 0b10000 << 11),
709710
CMEQ(UBit | 0b10001 << 11),
710711
MLS(UBit | 0b10010 << 11),
@@ -3557,6 +3558,31 @@ public void umaxVVV(ASIMDSize size, ElementSize eSize, Register dst, Register sr
35573558
threeSameEncoding(ASIMDInstruction.UMAX, size, elemSizeXX(eSize), dst, src1, src2);
35583559
}
35593560

3561+
/**
3562+
* C7.2.361 Unsigned maximum pairwise; emitted via the three-same encoding as UMAXP.<br>
3563+
*
3564+
* <code>
3565+
* concat = src2:src1
3566+
* for i in 0..n-1 do dst[i] = uint_max(concat[2 * i], concat[2 * i + 1])
3567+
* </code>
3568+
*
3569+
* @param size register size; asserted (with eSize) to describe more than one lane.
3570+
* @param eSize element size; asserted not to be DoubleWord (invalid lane width for umaxp).
3571+
* @param dst SIMD register receiving the pairwise maxima (asserted to be in the SIMD category).
3572+
* @param src1 SIMD register, the src1 part of the src2:src1 concatenation (asserted SIMD).
3573+
* @param src2 SIMD register, the src2 part of the src2:src1 concatenation (asserted SIMD).
3574+
*/
3575+
public void umaxpVVV(ASIMDSize size, ElementSize eSize, Register dst, Register src1, Register src2) {
3576+
assert usesMultipleLanes(size, eSize) : "Must use multiple lanes " + size + " " + eSize;
3577+
3578+
assert dst.getRegisterCategory().equals(SIMD) : dst;
3579+
assert src1.getRegisterCategory().equals(SIMD) : src1;
3580+
assert src2.getRegisterCategory().equals(SIMD) : src2;
3581+
assert eSize != ElementSize.DoubleWord : "Invalid lane width for umaxp";
3582+
3583+
threeSameEncoding(ASIMDInstruction.UMAXP, size, elemSizeXX(eSize), dst, src1, src2);
3584+
}
3585+
35603586
/**
35613587
* C7.2.362 Unsigned maximum across vector.<br>
35623588
*
@@ -3598,6 +3624,31 @@ public void uminVVV(ASIMDSize size, ElementSize eSize, Register dst, Register sr
35983624
threeSameEncoding(ASIMDInstruction.UMIN, size, elemSizeXX(eSize), dst, src1, src2);
35993625
}
36003626

3627+
/**
3628+
* C7.2.364 Unsigned minimum pairwise; emitted via the three-same encoding as UMINP.<br>
3629+
*
3630+
* <code>
3631+
* concat = src2:src1
3632+
* for i in 0..n-1 do dst[i] = uint_min(concat[2 * i], concat[2 * i + 1])
3633+
* </code>
3634+
*
3635+
* @param size register size; asserted (with eSize) to describe more than one lane.
3636+
* @param eSize element size; asserted not to be DoubleWord (invalid lane width for uminp).
3637+
* @param dst SIMD register receiving the pairwise minima (asserted to be in the SIMD category).
3638+
* @param src1 SIMD register, the src1 part of the src2:src1 concatenation (asserted SIMD).
3639+
* @param src2 SIMD register, the src2 part of the src2:src1 concatenation (asserted SIMD).
3640+
*/
3641+
public void uminpVVV(ASIMDSize size, ElementSize eSize, Register dst, Register src1, Register src2) {
3642+
assert usesMultipleLanes(size, eSize) : "Must use multiple lanes " + size + " " + eSize;
3643+
3644+
assert dst.getRegisterCategory().equals(SIMD) : dst;
3645+
assert src1.getRegisterCategory().equals(SIMD) : src1;
3646+
assert src2.getRegisterCategory().equals(SIMD) : src2;
3647+
assert eSize != ElementSize.DoubleWord : "Invalid lane width for uminp";
3648+
3649+
threeSameEncoding(ASIMDInstruction.UMINP, size, elemSizeXX(eSize), dst, src1, src2);
3650+
}
3651+
36013652
/**
36023653
* C7.2.365 Unsigned minimum across vector.<br>
36033654
*

compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/lir/aarch64/AArch64ArrayCompareToOp.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
*/
2525
package jdk.graal.compiler.lir.aarch64;
2626

27-
import static jdk.vm.ci.code.ValueUtil.asRegister;
2827
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.REG;
28+
import static jdk.vm.ci.code.ValueUtil.asRegister;
2929

3030
import jdk.graal.compiler.asm.Label;
3131
import jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler;
@@ -35,11 +35,10 @@
3535
import jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler;
3636
import jdk.graal.compiler.core.common.Stride;
3737
import jdk.graal.compiler.debug.GraalError;
38-
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
3938
import jdk.graal.compiler.lir.LIRInstructionClass;
4039
import jdk.graal.compiler.lir.Opcode;
40+
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
4141
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
42-
4342
import jdk.vm.ci.aarch64.AArch64Kind;
4443
import jdk.vm.ci.code.Register;
4544
import jdk.vm.ci.meta.AllocatableValue;
@@ -333,8 +332,11 @@ private void emitSIMDCode(AArch64MacroAssembler masm, Label stringsEqualUptoLeng
333332
masm.neon.cmeqVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, eSize, array1HighV, array1HighV, array2HighV);
334333
masm.neon.andVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, tmpRegV1, array1LowV, array1HighV);
335334
masm.neon.uminvSV(AArch64ASIMDAssembler.ASIMDSize.FullReg, eSize, tmpRegV1, tmpRegV1);
336-
masm.fcmpZero(64, tmpRegV1);
337-
masm.branchConditionally(ConditionFlag.EQ, mismatchInChunk);
335+
try (AArch64MacroAssembler.ScratchRegister scratchReg = masm.getScratchRegister()) {
336+
Register tmp = scratchReg.getRegister();
337+
masm.neon.umovGX(AArch64ASIMDAssembler.ElementSize.DoubleWord, tmp, tmpRegV1, 0);
338+
masm.cbz(64, tmp, mismatchInChunk);
339+
}
338340
masm.cmp(64, array1, lastChunkAddress1);
339341
masm.branchConditionally(ConditionFlag.LO, simdLoop);
340342

compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/lir/aarch64/AArch64ArrayEqualsOp.java

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@
2424
*/
2525
package jdk.graal.compiler.lir.aarch64;
2626

27-
import static jdk.vm.ci.code.ValueUtil.asRegister;
28-
import static jdk.vm.ci.code.ValueUtil.isIllegal;
2927
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R;
3028
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDInstruction.LD4_MULTIPLE_4R;
3129
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDSize.FullReg;
@@ -39,6 +37,8 @@
3937
import static jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler.PREFERRED_LOOP_ALIGNMENT;
4038
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
4139
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.REG;
40+
import static jdk.vm.ci.code.ValueUtil.asRegister;
41+
import static jdk.vm.ci.code.ValueUtil.isIllegal;
4242

4343
import java.util.Arrays;
4444

@@ -53,11 +53,10 @@
5353
import jdk.graal.compiler.core.common.StrideUtil;
5454
import jdk.graal.compiler.debug.Assertions;
5555
import jdk.graal.compiler.debug.GraalError;
56-
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
5756
import jdk.graal.compiler.lir.LIRInstructionClass;
5857
import jdk.graal.compiler.lir.Opcode;
58+
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
5959
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
60-
6160
import jdk.vm.ci.aarch64.AArch64Kind;
6261
import jdk.vm.ci.code.Register;
6362
import jdk.vm.ci.meta.Value;
@@ -232,7 +231,7 @@ private void emitArrayEquals(AArch64MacroAssembler asm,
232231
Register refAddress = len;
233232
asm.add(64, refAddress, arrayMax, len, ShiftType.LSL, strideMax.log2);
234233

235-
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
234+
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, tmp);
236235
asm.branchConditionally(ConditionFlag.NE, end);
237236

238237
asm.cmp(64, refAddress, arrayMax);
@@ -249,7 +248,7 @@ private void emitArrayEquals(AArch64MacroAssembler asm,
249248
// 64 byte loop
250249
asm.align(PREFERRED_LOOP_ALIGNMENT);
251250
asm.bind(vectorLoop);
252-
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
251+
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, tmp);
253252
asm.branchConditionally(ConditionFlag.NE, end);
254253
asm.cmp(64, arrayMax, refAddress);
255254
asm.branchConditionally(ConditionFlag.LO, vectorLoop);
@@ -263,13 +262,13 @@ private void emitArrayEquals(AArch64MacroAssembler asm,
263262
asm.sub(64, arrayM, arrayM, tmp, ShiftType.LSR, strideMax.log2 - strideM.log2);
264263
}
265264

266-
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
265+
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, tmp);
267266
asm.jmp(end);
268267

269268
// tail for 32 - 63 bytes
270-
tail32(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, len, tailLessThan64, tailLessThan32, end);
269+
tail32(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, len, tmp, tailLessThan64, tailLessThan32, end);
271270
// tail for 16 - 31 bytes
272-
tail16(asm, strideA, strideB, strideM, strideMax, strideMin, arrayA, arrayB, arrayM, len, tailLessThan32, tailLessThan16, end);
271+
tail16(asm, strideA, strideB, strideM, strideMax, strideMin, arrayA, arrayB, arrayM, len, tmp, tailLessThan32, tailLessThan16, end);
273272
// tail for 8 - 15 bytes
274273
tailLessThan16(asm, strideA, strideB, strideM, strideMax, arrayA, arrayB, arrayM, len, tmp, ret, tailLessThan16, tailLessThan8, end, 8);
275274
// tail for 4 - 7 bytes
@@ -291,7 +290,8 @@ private void simdCompare64(AArch64MacroAssembler asm,
291290
Stride strideMask,
292291
Register arrayMax,
293292
Register arrayMin,
294-
Register arrayMask) {
293+
Register arrayMask,
294+
Register tmp) {
295295
ElementSize minESize = fromStride(strideMin);
296296
switch (strideMax.log2 - strideMin.log2) {
297297
case 0:
@@ -420,7 +420,7 @@ private void simdCompare64(AArch64MacroAssembler asm,
420420
default:
421421
throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented"); // ExcludeFromJacocoGeneratedReport
422422
}
423-
vectorCheckZero(asm, v(0), v(0));
423+
cmpZeroVector(asm, v(0), v(0), tmp);
424424
}
425425

426426
private void tail32(AArch64MacroAssembler asm,
@@ -432,6 +432,7 @@ private void tail32(AArch64MacroAssembler asm,
432432
Register arrayMin,
433433
Register arrayMask,
434434
Register len,
435+
Register tmp,
435436
Label entry,
436437
Label nextTail,
437438
Label end) {
@@ -595,7 +596,7 @@ private void tail32(AArch64MacroAssembler asm,
595596
default:
596597
throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented"); // ExcludeFromJacocoGeneratedReport
597598
}
598-
vectorCheckZero(asm, v(0), v(0));
599+
cmpZeroVector(asm, v(0), v(0), tmp);
599600
asm.jmp(end);
600601
}
601602

@@ -609,6 +610,7 @@ private void tail16(AArch64MacroAssembler asm,
609610
Register arrayB,
610611
Register arrayM,
611612
Register len,
613+
Register tmp,
612614
Label entry,
613615
Label nextTail,
614616
Label end) {
@@ -663,7 +665,7 @@ private void tail16(AArch64MacroAssembler asm,
663665
asm.neon.eorVVV(FullReg, vecArrayA2, vecArrayA2, vecArrayB2);
664666
asm.neon.orrVVV(FullReg, vecArrayA1, vecArrayA1, vecArrayA2);
665667

666-
vectorCheckZero(asm, vecArrayA1, vecArrayA1);
668+
cmpZeroVector(asm, vecArrayA1, vecArrayA1, tmp);
667669
asm.jmp(end);
668670
}
669671

@@ -753,7 +755,7 @@ private void tailLessThan16(AArch64MacroAssembler asm,
753755
asm.neon.orrVVV(FullReg, vecArrayA1, vecArrayA1, vecArrayM1);
754756
}
755757
asm.neon.eorVVV(FullReg, vecArrayA1, vecArrayA1, vecArrayB1);
756-
vectorCheckZero(asm, vecArrayA1, vecArrayA1);
758+
cmpZeroVector(asm, vecArrayA1, vecArrayA1, tmp);
757759
} else if (strideMax.value == nBytes) {
758760
asm.bind(entry);
759761
// tail for length == 1

compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/lir/aarch64/AArch64ArrayIndexOfOp.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@
2525
*/
2626
package jdk.graal.compiler.lir.aarch64;
2727

28-
import static jdk.vm.ci.aarch64.AArch64.zr;
29-
import static jdk.vm.ci.code.ValueUtil.asRegister;
3028
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R;
3129
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDInstruction.LD4_MULTIPLE_4R;
3230
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDSize.FullReg;
3331
import static jdk.graal.compiler.asm.aarch64.AArch64Address.createStructureImmediatePostIndexAddress;
3432
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.ConditionFlag;
3533
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.REG;
34+
import static jdk.vm.ci.aarch64.AArch64.zr;
35+
import static jdk.vm.ci.code.ValueUtil.asRegister;
3636

3737
import java.util.Arrays;
3838

@@ -46,12 +46,11 @@
4646
import jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler.ScratchRegister;
4747
import jdk.graal.compiler.core.common.Stride;
4848
import jdk.graal.compiler.debug.GraalError;
49-
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
5049
import jdk.graal.compiler.lir.LIRInstructionClass;
5150
import jdk.graal.compiler.lir.Opcode;
51+
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
5252
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
5353
import jdk.graal.compiler.lir.gen.LIRGeneratorTool.ArrayIndexOfVariant;
54-
5554
import jdk.vm.ci.aarch64.AArch64Kind;
5655
import jdk.vm.ci.code.Register;
5756
import jdk.vm.ci.meta.AllocatableValue;
@@ -631,9 +630,11 @@ private void emitSIMDMatch(AArch64MacroAssembler masm,
631630
break;
632631
}
633632
masm.neon.orrVVV(FullReg, vecTmp[0], vecArray1, vecArray2);
634-
/* If value != 0, then there was a match somewhere. */
635-
vectorCheckZero(masm, ElementSize.fromStride(getMatchResultStride()), vecTmp[0], vecTmp[0], variant != ArrayIndexOfVariant.Table);
636-
masm.branchConditionally(ConditionFlag.NE, matchInChunk);
633+
try (ScratchRegister sc = masm.getScratchRegister()) {
634+
Register tmp = sc.getRegister();
635+
/* If value != 0, then there was a match somewhere. */
636+
cbnzVector(masm, ElementSize.fromStride(getMatchResultStride()), vecTmp[0], vecTmp[0], tmp, variant != ArrayIndexOfVariant.Table, matchInChunk);
637+
}
637638
}
638639

639640
private Stride getMatchResultStride() {

compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/lir/aarch64/AArch64ArrayRegionCompareToOp.java

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,17 @@
2424
*/
2525
package jdk.graal.compiler.lir.aarch64;
2626

27-
import static jdk.vm.ci.aarch64.AArch64.CPU;
28-
import static jdk.vm.ci.aarch64.AArch64.SIMD;
29-
import static jdk.vm.ci.aarch64.AArch64.zr;
30-
import static jdk.vm.ci.code.ValueUtil.asRegister;
31-
import static jdk.vm.ci.code.ValueUtil.isIllegal;
3227
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDSize.FullReg;
3328
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ElementSize.fromStride;
3429
import static jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler.PREFERRED_BRANCH_TARGET_ALIGNMENT;
3530
import static jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler.PREFERRED_LOOP_ALIGNMENT;
3631
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
3732
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.REG;
33+
import static jdk.vm.ci.aarch64.AArch64.CPU;
34+
import static jdk.vm.ci.aarch64.AArch64.SIMD;
35+
import static jdk.vm.ci.aarch64.AArch64.zr;
36+
import static jdk.vm.ci.code.ValueUtil.asRegister;
37+
import static jdk.vm.ci.code.ValueUtil.isIllegal;
3838

3939
import java.util.Arrays;
4040

@@ -49,11 +49,10 @@
4949
import jdk.graal.compiler.core.common.Stride;
5050
import jdk.graal.compiler.core.common.StrideUtil;
5151
import jdk.graal.compiler.debug.GraalError;
52-
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
5352
import jdk.graal.compiler.lir.LIRInstructionClass;
5453
import jdk.graal.compiler.lir.Opcode;
54+
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
5555
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
56-
5756
import jdk.vm.ci.aarch64.AArch64Kind;
5857
import jdk.vm.ci.code.Register;
5958
import jdk.vm.ci.meta.AllocatableValue;
@@ -202,8 +201,7 @@ private void emitArrayCompare(CompilationResultBuilder crb, AArch64MacroAssemble
202201
asm.neon.eorVVV(FullReg, vecTmp1, vecArrayA1, vecArrayB1);
203202
asm.neon.eorVVV(FullReg, vecTmp2, vecArrayA2, vecArrayB2);
204203
asm.neon.orrVVV(FullReg, vecTmp2, vecTmp2, vecTmp1);
205-
vectorCheckZero(asm, vecTmp2, vecTmp2);
206-
asm.branchConditionally(ConditionFlag.NE, diffFound);
204+
cbnzVector(asm, ElementSize.Byte, vecTmp2, vecTmp2, tmp, false, diffFound);
207205
// if so, continue
208206
asm.cmp(64, maxStrideArray, refAddress);
209207
asm.branchConditionally(ConditionFlag.LO, vectorLoop);
@@ -218,8 +216,7 @@ private void emitArrayCompare(CompilationResultBuilder crb, AArch64MacroAssemble
218216
asm.neon.eorVVV(FullReg, vecTmp1, vecArrayA1, vecArrayB1);
219217
asm.neon.eorVVV(FullReg, vecTmp2, vecArrayA2, vecArrayB2);
220218
asm.neon.orrVVV(FullReg, vecTmp2, vecTmp2, vecTmp1);
221-
vectorCheckZero(asm, vecTmp2, vecTmp2);
222-
asm.branchConditionally(ConditionFlag.NE, diffFound);
219+
cbnzVector(asm, ElementSize.Byte, vecTmp2, vecTmp2, tmp, false, diffFound);
223220
asm.mov(64, ret, zr);
224221
asm.jmp(end);
225222

@@ -239,8 +236,7 @@ private void emitArrayCompare(CompilationResultBuilder crb, AArch64MacroAssemble
239236
asm.align(PREFERRED_BRANCH_TARGET_ALIGNMENT);
240237
asm.bind(diffFound);
241238
// check if vecArrayA1 and vecArrayB1 are equal
242-
vectorCheckZero(asm, vecTmp1, vecTmp1);
243-
asm.branchConditionally(ConditionFlag.NE, returnV1);
239+
cbnzVector(asm, ElementSize.Byte, vecTmp1, vecTmp1, tmp, false, returnV1);
244240
calcReturnValue(asm, ret, vecArrayA2, vecArrayB2, vecArrayA1, vecArrayB1, vecMask, strideMax);
245241
asm.jmp(end);
246242

0 commit comments

Comments
 (0)