|
3 | 3 | ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ |
4 | 4 | ; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V |
5 | 5 | ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ |
| 6 | +; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,TAILFOLD |
6 | 7 |
|
7 | 8 | ; TODO: Remove -prefer-predicate-over-epilogue=scalar-epilogue once partial reductions are supported with EVL tail folding. |
8 | 9 |
|
@@ -147,6 +148,37 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { |
147 | 148 | ; FIXED-ZVQDOTQ: for.exit: |
148 | 149 | ; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] |
149 | 150 | ; |
| 151 | +; TAILFOLD-LABEL: define i32 @vqdot( |
| 152 | +; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { |
| 153 | +; TAILFOLD-NEXT: entry: |
| 154 | +; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] |
| 155 | +; TAILFOLD: vector.ph: |
| 156 | +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] |
| 157 | +; TAILFOLD: vector.body: |
| 158 | +; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 159 | +; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] |
| 160 | +; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 161 | +; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) |
| 162 | +; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]] |
| 163 | +; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 164 | +; TAILFOLD-NEXT: [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32> |
| 165 | +; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]] |
| 166 | +; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 167 | +; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32> |
| 168 | +; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]] |
| 169 | +; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]] |
| 170 | +; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]]) |
| 171 | +; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64 |
| 172 | +; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]] |
| 173 | +; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] |
| 174 | +; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 |
| 175 | +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] |
| 176 | +; TAILFOLD: middle.block: |
| 177 | +; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]]) |
| 178 | +; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]] |
| 179 | +; TAILFOLD: for.exit: |
| 180 | +; TAILFOLD-NEXT: ret i32 [[TMP10]] |
| 181 | +; |
150 | 182 | entry: |
151 | 183 | br label %for.body |
152 | 184 |
|
@@ -309,6 +341,37 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { |
309 | 341 | ; FIXED-ZVQDOTQ: for.exit: |
310 | 342 | ; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] |
311 | 343 | ; |
| 344 | +; TAILFOLD-LABEL: define i32 @vqdotu( |
| 345 | +; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { |
| 346 | +; TAILFOLD-NEXT: entry: |
| 347 | +; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] |
| 348 | +; TAILFOLD: vector.ph: |
| 349 | +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] |
| 350 | +; TAILFOLD: vector.body: |
| 351 | +; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 352 | +; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] |
| 353 | +; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 354 | +; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) |
| 355 | +; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]] |
| 356 | +; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 357 | +; TAILFOLD-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32> |
| 358 | +; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]] |
| 359 | +; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 360 | +; TAILFOLD-NEXT: [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32> |
| 361 | +; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]] |
| 362 | +; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]] |
| 363 | +; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]]) |
| 364 | +; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64 |
| 365 | +; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]] |
| 366 | +; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] |
| 367 | +; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 |
| 368 | +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] |
| 369 | +; TAILFOLD: middle.block: |
| 370 | +; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]]) |
| 371 | +; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]] |
| 372 | +; TAILFOLD: for.exit: |
| 373 | +; TAILFOLD-NEXT: ret i32 [[TMP10]] |
| 374 | +; |
312 | 375 | entry: |
313 | 376 | br label %for.body |
314 | 377 |
|
@@ -471,6 +534,37 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { |
471 | 534 | ; FIXED-ZVQDOTQ: for.exit: |
472 | 535 | ; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] |
473 | 536 | ; |
| 537 | +; TAILFOLD-LABEL: define i32 @vqdotsu( |
| 538 | +; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { |
| 539 | +; TAILFOLD-NEXT: entry: |
| 540 | +; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] |
| 541 | +; TAILFOLD: vector.ph: |
| 542 | +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] |
| 543 | +; TAILFOLD: vector.body: |
| 544 | +; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 545 | +; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] |
| 546 | +; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 547 | +; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) |
| 548 | +; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]] |
| 549 | +; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 550 | +; TAILFOLD-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32> |
| 551 | +; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]] |
| 552 | +; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 553 | +; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32> |
| 554 | +; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]] |
| 555 | +; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]] |
| 556 | +; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]]) |
| 557 | +; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64 |
| 558 | +; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]] |
| 559 | +; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] |
| 560 | +; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 |
| 561 | +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] |
| 562 | +; TAILFOLD: middle.block: |
| 563 | +; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]]) |
| 564 | +; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]] |
| 565 | +; TAILFOLD: for.exit: |
| 566 | +; TAILFOLD-NEXT: ret i32 [[TMP10]] |
| 567 | +; |
474 | 568 | entry: |
475 | 569 | br label %for.body |
476 | 570 |
|
@@ -632,6 +726,37 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { |
632 | 726 | ; FIXED-ZVQDOTQ: for.exit: |
633 | 727 | ; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] |
634 | 728 | ; |
| 729 | +; TAILFOLD-LABEL: define i32 @vqdotsu2( |
| 730 | +; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { |
| 731 | +; TAILFOLD-NEXT: entry: |
| 732 | +; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] |
| 733 | +; TAILFOLD: vector.ph: |
| 734 | +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] |
| 735 | +; TAILFOLD: vector.body: |
| 736 | +; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 737 | +; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] |
| 738 | +; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] |
| 739 | +; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) |
| 740 | +; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]] |
| 741 | +; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 742 | +; TAILFOLD-NEXT: [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32> |
| 743 | +; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]] |
| 744 | +; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]]) |
| 745 | +; TAILFOLD-NEXT: [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32> |
| 746 | +; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]] |
| 747 | +; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]] |
| 748 | +; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]]) |
| 749 | +; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64 |
| 750 | +; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]] |
| 751 | +; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] |
| 752 | +; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 |
| 753 | +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] |
| 754 | +; TAILFOLD: middle.block: |
| 755 | +; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]]) |
| 756 | +; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]] |
| 757 | +; TAILFOLD: for.exit: |
| 758 | +; TAILFOLD-NEXT: ret i32 [[TMP10]] |
| 759 | +; |
635 | 760 | entry: |
636 | 761 | br label %for.body |
637 | 762 |
|
|
0 commit comments