[AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td #142565
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

I think that the issue width for neoverse-v2 CPUs is set too high and does not properly reflect the dispatch constraints. I tested various values of IssueWidth (16, 8 and 6) with runs of SPEC2017 on a neoverse-v2 machine and I got the highest overall geomean score with an issue width of 6, although it's only a marginal 0.14% improvement. I also observed a 1-2% improvement when testing the Gromacs application with some workloads. Here are some notable changes in SPEC2017 ref runtimes, i.e. those with a ~0.5% change or greater ('-' means faster):

548.exchange2: -1.7%
510.parest: -0.78%
538.imagick: -0.73%
500.perlbench: -0.57%
525.x264: -0.55%
507.cactuBSSN: -0.5%
520.omnetpp: -0.48%
511.povray: +0.57%
544.nab: +0.65%
503.bwaves: +0.68%
526.blender: +0.75%

If this patch causes any major regressions post-commit it can be easily reverted, but I think it should be an overall improvement.

Patch is 311.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142565.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 39f7077ae4514..2fea569296427 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -14,7 +14,7 @@
//===----------------------------------------------------------------------===//
def NeoverseV2Model : SchedMachineModel {
- let IssueWidth = 16; // Micro-ops dispatched at a time.
+ let IssueWidth = 6; // Micro-ops dispatched at a time.
let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
index 0d4c053551011..ecc972ef237b5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown -mcpu=neoverse-v2 -o - | FileCheck %s
+; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown \
+; RUN: -mcpu=neoverse-v1 -mattr=+sve2 -o - | FileCheck %s
define i64 @sabalb_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: sabalb_i32_to_i64_accumulation
@@ -423,4 +424,4 @@ exit:
ret i16 %reduce
}
-declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
\ No newline at end of file
+declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
index 581dad6b68dcf..54b5f1644be48 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
@@ -2536,14 +2536,14 @@ drps
# CHECK-NEXT: 1 2 0.50 bics x3, xzr, x3, lsl #1
# CHECK-NEXT: 1 2 0.50 tst w3, w7, lsl #31
# CHECK-NEXT: 1 2 0.50 tst x2, x20, asr #2
-# CHECK-NEXT: 1 0 0.06 mov x3, x6
-# CHECK-NEXT: 1 0 0.06 mov x3, xzr
-# CHECK-NEXT: 1 0 0.06 mov wzr, w2
-# CHECK-NEXT: 1 0 0.06 mov w3, w5
+# CHECK-NEXT: 1 0 0.17 mov x3, x6
+# CHECK-NEXT: 1 0 0.17 mov x3, xzr
+# CHECK-NEXT: 1 0 0.17 mov wzr, w2
+# CHECK-NEXT: 1 0 0.17 mov w3, w5
# CHECK-NEXT: 1 1 0.17 movz w2, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 mov w2, #-1235
# CHECK-NEXT: 1 1 0.17 mov x2, #5299989643264
-# CHECK-NEXT: 1 0 0.06 mov x2, #0
+# CHECK-NEXT: 1 0 0.17 mov x2, #0
# CHECK-NEXT: 1 1 0.17 movk w3, #0
# CHECK-NEXT: 1 1 0.17 movz x4, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 movk w5, #0, lsl #16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index fbf65e26e99a5..3398331a67f5b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [1] Code Region - FPR16-bit
@@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [2] Code Region - FPR32-bit
@@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [3] Code Region - FPR64-bit
@@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [4] Code Region - FPR128-bit
@@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-b
@@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-h
@@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [7] Code Region - SIMD64-bit-s
@@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [8] Code Region - SIMD64-bit-d
@@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [9] Code Region - insr
@@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 803
# CHECK-NEXT: Total uOps: 300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
-# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
-# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] .D===============eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] .D=====================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] .D=======================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] .D=============================eeER add z0.s, z0.s, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
-# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
-# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 4 12.5 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 18.5 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 15.5 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
index 0f5ab183f5358..39a779b27fe7f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
@@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x...
[truncated]
@@ -1,4 +1,5 @@
-; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown -mcpu=neoverse-v2 -o - | FileCheck %s
+; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown \
+; RUN:   -mcpu=neoverse-v1 -mattr=+sve2 -o - | FileCheck %s
I changed the RUN line here to match the spirit of the test, which is to reassociate accumulators. It was previously relying upon the scheduling of a neoverse-v2 CPU, but I can get the same effect by tuning for neoverse-v1 and enabling SVE2.
What happens when you don't change the RUN line? I ask because the new RUN line is artificial, whereas if the transformation is ever broken for a genuine target then I think we should know about it.
If I leave the RUN line alone, then the test changes in such a way that it's no longer testing "reassociate chains of accumulation instructions into a tree" (#132728).
I plan to rewrite the SVE test in a separate patch since it's not representative of actual vectorised code. Once I do that, this PR will have no effect on the test.
We might want to decrease the value for the other scheduling models too.
Hi David, I think this makes sense. I would like to run some more benchmarks, and will report back here in a day or two.
Related: #136374
If @sjoerdmeijer's data reflects our own measurements then this PR looks good to me.
I suspect we are looking at very similar things. I have tested the interaction between this and #140897, which I plan to commit at the end of this week. That all looks fine for my motivating examples and app, so that is a good start.
I would like to do some more extensive testing now, and need a bit more time, maybe a day or so.
In the meantime, I've added one nit.
@@ -14,7 +14,7 @@
 //===----------------------------------------------------------------------===//

 def NeoverseV2Model : SchedMachineModel {
-  let IssueWidth = 16; // Micro-ops dispatched at a time.
+  let IssueWidth = 6;  // Micro-ops dispatched at a time.
Nit: can we update the comment and try to be more specific about what we mean here, because things are all a bit vague in the SWOG and here too. So, a bit more rationale that we are now modelling the dispatch constraints rather than the issue width, and that 6 is better than 8 because that's what the benchmarks show.
If we get it right here, we might get it right elsewhere too, because these things tend to be copied from one scheduling file to the next one. ;-)
I've tried to update the comment, hope it makes sense!
Perhaps just call it the decode bandwidth as that is where the value comes from, and mention that from empirical measurements lower values worked better than higher values?
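For illustration, a revised comment along those lines might look something like this (a sketch only, not the exact wording that landed in the patch):

def NeoverseV2Model : SchedMachineModel {
  // This models the dispatch constraints (roughly the decode bandwidth)
  // rather than the core's true issue width: empirical SPEC2017
  // measurements showed that a value of 6 gives better schedules than
  // 8 or 16.
  let IssueWidth = 6;
  ...
}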
LGTM too, thanks.
Cheers
It has been observed that the issue width for neoverse-n2 CPUs is set too high, and does not properly reflect the dispatch constraints. I tested various values of IssueWidth (10, 8, 6, 5, 4) with runs of various workloads on a neoverse-n2 machine and I got the highest overall geomean score with an issue width of 5. If this patch were to cause any major regression post-commit, it could be easily reverted, but it is likely to show an overall improvement. Related Neoverse-V2 PR: #142565
I think that the issue width for neoverse-v2 CPUs is set too
high and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (16, 8 and 6) with runs
of SPEC2017 on a neoverse-v2 machine and I got the highest
overall geomean score with an issue width of 6, although it's
only a marginal 0.14% improvement. I also observed a 1-2%
improvement when testing the Gromacs application with some
workloads. Here are some notable changes in SPEC2017 ref
runtimes, i.e. those with a ~0.5% change or greater ('-' means
faster):
548.exchange2: -1.7%
510.parest: -0.78%
538.imagick: -0.73%
500.perlbench: -0.57%
525.x264: -0.55%
507.cactuBSSN: -0.5%
520.omnetpp: -0.48%
511.povray: +0.57%
544.nab: +0.65%
503.bwaves: +0.68%
526.blender: +0.75%
If this patch causes any major regressions post-commit it can
be easily reverted, but I think it should be an overall
improvement.
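For reference, a scheduling model's IssueWidth is what llvm-mca reports as the dispatch width, so the effect of this change can be checked directly with an llvm-mca test in the same style as the ones updated above (the single add instruction here is just a placeholder):

# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 %s | FileCheck %s
add x0, x1, x2
# CHECK: Dispatch Width: 6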