diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index 39f7077ae4514..8d3a4553d4b73 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// def NeoverseV2Model : SchedMachineModel { - let IssueWidth = 16; // Micro-ops dispatched at a time. + let IssueWidth = 6; // This value comes from the decode bandwidth + // and empirical measurements showed that a + // lower value is better. let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2. diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s index 581dad6b68dcf..54b5f1644be48 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s @@ -2536,14 +2536,14 @@ drps # CHECK-NEXT: 1 2 0.50 bics x3, xzr, x3, lsl #1 # CHECK-NEXT: 1 2 0.50 tst w3, w7, lsl #31 # CHECK-NEXT: 1 2 0.50 tst x2, x20, asr #2 -# CHECK-NEXT: 1 0 0.06 mov x3, x6 -# CHECK-NEXT: 1 0 0.06 mov x3, xzr -# CHECK-NEXT: 1 0 0.06 mov wzr, w2 -# CHECK-NEXT: 1 0 0.06 mov w3, w5 +# CHECK-NEXT: 1 0 0.17 mov x3, x6 +# CHECK-NEXT: 1 0 0.17 mov x3, xzr +# CHECK-NEXT: 1 0 0.17 mov wzr, w2 +# CHECK-NEXT: 1 0 0.17 mov w3, w5 # CHECK-NEXT: 1 1 0.17 movz w2, #0, lsl #16 # CHECK-NEXT: 1 1 0.17 mov w2, #-1235 # CHECK-NEXT: 1 1 0.17 mov x2, #5299989643264 -# CHECK-NEXT: 1 0 0.06 mov x2, #0 +# CHECK-NEXT: 1 0 0.17 mov x2, #0 # CHECK-NEXT: 1 1 0.17 movk w3, #0 # CHECK-NEXT: 1 1 0.17 movz x4, #0, lsl #16 # CHECK-NEXT: 1 1 0.17 movk w5, #0, lsl #16 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s index fbf65e26e99a5..3398331a67f5b 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s @@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr b0, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr b0, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [1] Code Region - FPR16-bit @@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr h0, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr h0, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [2] Code Region - FPR32-bit @@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr s0, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr s0, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [3] Code Region - FPR64-bit @@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr d0, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr d0, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [4] Code Region - FPR128-bit @@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr q0, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr q0, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [5] Code Region - SIMD64-bit-b @@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.8b }, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.8b }, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [6] Code Region - SIMD64-bit-h @@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.4h }, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.4h }, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [7] Code Region - SIMD64-bit-s @@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.2s }, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.2s }, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [8] Code Region - SIMD64-bit-d @@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d # CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp] # CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d -# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp] -# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d +# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.1d }, [sp] +# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp] -# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d -# CHECK-NEXT: 4 4.3 0.6 0.6 +# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.1d }, [sp] +# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d +# CHECK-NEXT: 4 4.0 0.5 0.6 # CHECK: [9] Code Region - insr @@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: Total Cycles: 803 # CHECK-NEXT: Total uOps: 300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.37 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 1.0 @@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s # CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0 # CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s -# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0 -# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s -# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0 -# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s +# CHECK-NEXT: [2,0] .D===============eeeeeeER. . . insr z0.s, w0 +# CHECK-NEXT: [2,1] .D=====================eeER . . add z0.s, z0.s, z0.s +# CHECK-NEXT: [3,0] .D=======================eeeeeeER . insr z0.s, w0 +# CHECK-NEXT: [3,1] .D=============================eeER add z0.s, z0.s, z0.s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0 -# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s -# CHECK-NEXT: 4 16.0 0.1 0.0 +# CHECK-NEXT: 0. 4 12.5 0.3 0.0 insr z0.s, w0 +# CHECK-NEXT: 1. 4 18.5 0.0 0.0 add z0.s, z0.s, z0.s +# CHECK-NEXT: 4 15.5 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s index 0f5ab183f5358..39a779b27fe7f 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s @@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 3.0 @@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0 # CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 # CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0 -# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0 -# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0 +# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 # CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0 -# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0 -# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0 -# CHECK-NEXT: 2 7.0 0.1 0.0 +# CHECK-NEXT: 2. 2 7.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 2 9.0 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 2 6.8 0.1 0.0 # CHECK: [1] Code Region - smaddl @@ -353,7 +353,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 3.0 @@ -368,8 +368,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0 # CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 # CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0 -# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0 -# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0 +# CHECK-NEXT: [1,2] .D=========eeER.. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,3] .D===========eeER smaddl x0, w0, w0, x0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -380,9 +380,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 # CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0 -# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0 -# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0 -# CHECK-NEXT: 2 7.0 0.1 0.0 +# CHECK-NEXT: 2. 2 7.0 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 3. 2 9.0 0.0 0.0 smaddl x0, w0, w0, x0 +# CHECK-NEXT: 2 6.8 0.1 0.0 # CHECK: [2] Code Region - fmadd @@ -391,7 +391,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.35 # CHECK-NEXT: IPC: 0.35 # CHECK-NEXT: Block RThroughput: 1.5 @@ -406,12 +406,12 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0 # CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0 # CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2 -# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0 -# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0 -# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0 -# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0 -# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0 -# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,0] .D================eeER . . .. fadd d0, d0, d0 +# CHECK-NEXT: [1,1] .D==================eeeeER . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,2] .D======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] .D=============================eeeeER fmadd d0, d0, d1, d2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -420,13 +420,13 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0 -# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0 -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0 -# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0 -# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmadd d0, d0, d1, d2 -# CHECK-NEXT: 2 15.7 0.1 0.0 +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fadd d0, d0, d0 +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 15.2 0.1 0.0 # CHECK: [3] Code Region - saba @@ -435,7 +435,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.5 @@ -450,8 +450,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s # CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s -# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s -# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,2] .D=================eeeeER. . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] .D=====================eeeeER saba v0.4s, v0.4s, v1.4s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -462,9 +462,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [4] Code Region - sdot @@ -473,7 +473,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1103 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.36 # CHECK-NEXT: Block RThroughput: 0.8 @@ -488,8 +488,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b # CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b -# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b -# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,2] .D===============eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] .D==================eeeER sdot v0.4s, v0.16b, v1.16b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -500,9 +500,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b -# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b -# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b -# CHECK-NEXT: 2 10.8 0.1 0.0 +# CHECK-NEXT: 2. 2 11.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.0 0.0 0.0 sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.5 0.1 0.0 # CHECK: [5] Code Region - smmla @@ -511,7 +511,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1103 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.36 # CHECK-NEXT: Block RThroughput: 0.8 @@ -526,8 +526,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b # CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b -# CHECK-NEXT: [1,2] D================eeeER . smmla v0.4s, v1.16b, v2.16b -# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,2] .D===============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] .D==================eeeER smmla v0.4s, v0.16b, v1.16b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -538,9 +538,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b -# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b -# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b -# CHECK-NEXT: 2 10.8 0.1 0.0 +# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.5 0.1 0.0 # CHECK: [6] Code Region - mla @@ -549,7 +549,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -564,8 +564,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s # CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s -# CHECK-NEXT: [1,2] D==================eeeeER. . mla v0.4s, v1.4s, v2.4s -# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,2] .D=================eeeeER. . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] .D=====================eeeeER mla v0.4s, v0.4s, v1.4s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -576,9 +576,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [7] Code Region - sqrdmlah @@ -587,7 +587,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.29 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 3.5 @@ -602,8 +602,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D==========eeeeER . . . sqrdmlah v0.4s, v0.4s, v1.4s # CHECK-NEXT: [1,0] D==============eeeeER . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D==================eeeeER. . sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: [1,2] D====================eeeeER . sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: [1,3] D========================eeeeER sqrdmlah v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,2] .D===================eeeeER . sqrdmlah v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] .D=======================eeeeER sqrdmlah v0.4s, v0.4s, v1.4s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -614,9 +614,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 12.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s -# CHECK-NEXT: 2 13.0 0.1 0.0 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [8] Code Region - smlal2 @@ -625,7 +625,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -640,8 +640,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h # CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,2] D==================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,2] .D=================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -652,9 +652,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [9] Code Region - sadalp @@ -663,7 +663,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.5 @@ -678,8 +678,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s # CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s -# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s -# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s +# CHECK-NEXT: [1,2] .D=================eeeeER. . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,3] .D=====================eeeeER sadalp v0.2d, v0.4s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -690,9 +690,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [10] Code Region - ssra @@ -701,7 +701,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.5 @@ -716,8 +716,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1 # CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1 -# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1 -# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: [1,2] .D=================eeeeER. . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,3] .D=====================eeeeER ssra v0.2d, v0.2d, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -728,9 +728,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1 -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1 -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1 -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [11] Code Region - fcmla @@ -739,7 +739,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -754,8 +754,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90 # CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s # CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90 -# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 -# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -766,9 +766,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 -# CHECK-NEXT: 2 11.8 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [12] Code Region - fmla @@ -777,7 +777,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.35 # CHECK-NEXT: IPC: 0.35 # CHECK-NEXT: Block RThroughput: 1.5 @@ -792,12 +792,12 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d # CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d # CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d -# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d -# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,0] .D================eeeER . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D=================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,2] .D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] .D=============================eeeeER fmla v0.2d, v0.2d, v1.2d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -806,13 +806,13 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d -# CHECK-NEXT: 2 15.3 0.1 0.0 +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 14.8 0.1 0.0 # CHECK: [13] Code Region - fmlal @@ -821,7 +821,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1903 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.32 # CHECK-NEXT: IPC: 0.32 # CHECK-NEXT: Block RThroughput: 1.5 @@ -836,12 +836,12 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h # CHECK-NEXT: [0,4] D===========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h # CHECK-NEXT: [0,5] D===============eeeeER . . . . fmlal v0.4s, v0.4h, v1.4h -# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul v0.2d, v0.2d, v0.2d -# CHECK-NEXT: [1,1] D======================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: [1,2] D==========================eeER . . fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: [1,3] D============================eeeeER. . fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: [1,4] D==============================eeeeER . fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: [1,5] D==================================eeeeER fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,0] .D==================eeeER. . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D=====================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,2] .D=========================eeER . . fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] .D===========================eeeeER. . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] .D=============================eeeeER . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] .D=================================eeeeER fmlal v0.4s, v0.4h, v1.4h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -850,13 +850,13 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: 4. 2 21.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: 5. 2 25.5 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h -# CHECK-NEXT: 2 18.0 0.1 0.0 +# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 2. 2 17.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 19.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 21.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 25.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 17.5 0.1 0.0 # CHECK: [14] Code Region - bfdot @@ -865,7 +865,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1603 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.25 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 1.0 @@ -880,8 +880,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfdot v0.4s, v0.8h, v1.8h # CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: [1,1] D===================eeeeeER . . bfdot v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,2] D======================eeeeeER. . bfdot v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,3] D===========================eeeeeER bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D==========================eeeeeER bfdot v0.4s, v0.8h, v1.8h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -892,9 +892,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h -# CHECK-NEXT: 2 14.0 0.1 0.0 +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 13.8 0.1 0.0 # CHECK: [15] Code Region - bfmmla @@ -903,7 +903,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1903 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.21 # CHECK-NEXT: IPC: 0.21 # CHECK-NEXT: Block RThroughput: 1.0 @@ -918,8 +918,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=============eeeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h # CHECK-NEXT: [1,0] D===================eeeER. . . . fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: [1,1] D======================eeeeeeER . . bfmmla v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,2] D==========================eeeeeeER. . bfmmla v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,3] D================================eeeeeeER bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,2] .D=========================eeeeeeER. . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D===============================eeeeeeER bfmmla v0.4s, v0.8h, v1.8h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -930,9 +930,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1. 2 13.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 2. 2 17.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 3. 2 23.5 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h -# CHECK-NEXT: 2 16.3 0.1 0.0 +# CHECK-NEXT: 2. 2 17.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 16.0 0.1 0.0 # CHECK: [16] Code Region - bfmlalb @@ -941,7 +941,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 1.0 @@ -956,8 +956,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb v0.4s, v0.8h, v1.8h # CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h -# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,2] .D===================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D========================eeeeeER bfmlalb v0.4s, v0.8h, v1.8h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -968,9 +968,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h -# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h -# CHECK-NEXT: 2 13.0 0.1 0.0 +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [17] Code Region - crc32b @@ -979,7 +979,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 3.0 @@ -994,8 +994,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=====eeER. .. crc32b w0, w0, w0 # CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0 # CHECK-NEXT: [1,1] D=========eeER .. crc32b w0, w0, w1 -# CHECK-NEXT: [1,2] D==========eeER.. crc32b w0, w0, w1 -# CHECK-NEXT: [1,3] D============eeER crc32b w0, w0, w0 +# CHECK-NEXT: [1,2] .D=========eeER.. crc32b w0, w0, w1 +# CHECK-NEXT: [1,3] .D===========eeER crc32b w0, w0, w0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1006,9 +1006,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0 # CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32b w0, w0, w1 -# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32b w0, w0, w1 -# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32b w0, w0, w0 -# CHECK-NEXT: 2 7.0 0.1 0.0 +# CHECK-NEXT: 2. 2 7.0 0.0 0.0 crc32b w0, w0, w1 +# CHECK-NEXT: 3. 2 9.0 0.0 0.0 crc32b w0, w0, w0 +# CHECK-NEXT: 2 6.8 0.1 0.0 # CHECK: [18] Code Region - Z saba @@ -1017,7 +1017,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 1.5 @@ -1030,10 +1030,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . saba z0.d, z1.d, z2.d # CHECK-NEXT: [0,2] D======eeeeER . . . . saba z0.d, z1.d, z2.d # CHECK-NEXT: [0,3] D==========eeeeER . . . saba z0.d, z0.d, z1.d -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . saba z0.d, z1.d, z2.d -# CHECK-NEXT: [1,2] D====================eeeeER . saba z0.d, z1.d, z2.d -# CHECK-NEXT: [1,3] D========================eeeeER saba z0.d, z0.d, z1.d +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . saba z0.d, z1.d, z2.d +# CHECK-NEXT: [1,2] .D===================eeeeER . saba z0.d, z1.d, z2.d +# CHECK-NEXT: [1,3] .D=======================eeeeER saba z0.d, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1042,11 +1042,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 saba z0.d, z1.d, z2.d -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 saba z0.d, z1.d, z2.d -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 saba z0.d, z0.d, z1.d -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 saba z0.d, z1.d, z2.d +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 saba z0.d, z1.d, z2.d +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 saba z0.d, z0.d, z1.d +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [19] Code Region - Z sadalp @@ -1055,7 +1055,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 1.5 @@ -1068,10 +1068,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp z0.d, p0/m, z1.s # CHECK-NEXT: [0,2] D======eeeeER . . . . sadalp z0.d, p0/m, z1.s # CHECK-NEXT: [0,3] D==========eeeeER . . . sadalp z0.d, p0/m, z0.s -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . sadalp z0.d, p0/m, z1.s -# CHECK-NEXT: [1,2] D====================eeeeER . sadalp z0.d, p0/m, z1.s -# CHECK-NEXT: [1,3] D========================eeeeER sadalp z0.d, p0/m, z0.s +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: [1,2] .D===================eeeeER . sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: [1,3] .D=======================eeeeER sadalp z0.d, p0/m, z0.s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1080,11 +1080,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sadalp z0.d, p0/m, z1.s -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sadalp z0.d, p0/m, z1.s -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sadalp z0.d, p0/m, z0.s -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sadalp z0.d, p0/m, z0.s +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [20] Code Region - Z ssra @@ -1093,7 +1093,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 1.5 @@ -1106,10 +1106,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra z0.d, z1.d, #1 # CHECK-NEXT: [0,2] D======eeeeER . . . . ssra z0.d, z1.d, #1 # CHECK-NEXT: [0,3] D==========eeeeER . . . ssra z0.d, z0.d, #1 -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . ssra z0.d, z1.d, #1 -# CHECK-NEXT: [1,2] D====================eeeeER . ssra z0.d, z1.d, #1 -# CHECK-NEXT: [1,3] D========================eeeeER ssra z0.d, z0.d, #1 +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . ssra z0.d, z1.d, #1 +# CHECK-NEXT: [1,2] .D===================eeeeER . ssra z0.d, z1.d, #1 +# CHECK-NEXT: [1,3] .D=======================eeeeER ssra z0.d, z0.d, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1118,11 +1118,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 ssra z0.d, z1.d, #1 -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 ssra z0.d, z1.d, #1 -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 ssra z0.d, z0.d, #1 -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 ssra z0.d, z1.d, #1 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 ssra z0.d, z1.d, #1 +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 ssra z0.d, z0.d, #1 +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [21] Code Region - Z cdot.s @@ -1131,7 +1131,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.42 # CHECK-NEXT: IPC: 0.33 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1144,10 +1144,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeER . . .. cdot z0.s, z1.b, z2.b, #90 # CHECK-NEXT: [0,2] D======eeeER . . .. cdot z0.s, z1.b, z2.b, #90 # CHECK-NEXT: [0,3] D=========eeeER. . .. cdot z0.s, z0.b, z1.b, #90 -# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D=================eeeER .. cdot z0.s, z1.b, z2.b, #90 -# CHECK-NEXT: [1,2] D==================eeeER .. cdot z0.s, z1.b, z2.b, #90 -# CHECK-NEXT: [1,3] D=====================eeeER cdot z0.s, z0.b, z1.b, #90 +# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D================eeeER .. cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: [1,2] .D=================eeeER .. cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: [1,3] .D====================eeeER cdot z0.s, z0.b, z1.b, #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1156,11 +1156,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 12.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90 -# CHECK-NEXT: 2. 2 13.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90 -# CHECK-NEXT: 3. 2 16.0 0.0 0.0 cdot z0.s, z0.b, z1.b, #90 -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: 3. 2 15.5 0.0 0.0 cdot z0.s, z0.b, z1.b, #90 +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [22] Code Region - Z cdot.d @@ -1169,7 +1169,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1182,10 +1182,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . cdot z0.d, z1.h, z2.h, #90 # CHECK-NEXT: [0,2] D======eeeeER . . . . cdot z0.d, z1.h, z2.h, #90 # CHECK-NEXT: [0,3] D==========eeeeER . . . cdot z0.d, z0.h, z1.h, #90 -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . cdot z0.d, z1.h, z2.h, #90 -# CHECK-NEXT: [1,2] D====================eeeeER . cdot z0.d, z1.h, z2.h, #90 -# CHECK-NEXT: [1,3] D========================eeeeER cdot z0.d, z0.h, z1.h, #90 +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: [1,2] .D===================eeeeER . cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: [1,3] .D=======================eeeeER cdot z0.d, z0.h, z1.h, #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1194,11 +1194,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90 -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90 -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cdot z0.d, z0.h, z1.h, #90 -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90 +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [23] Code Region - Z cmla.b @@ -1207,7 +1207,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1220,10 +1220,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . cmla z0.b, z1.b, z2.b, #90 # CHECK-NEXT: [0,2] D======eeeeER . . . . cmla z0.b, z1.b, z2.b, #90 # CHECK-NEXT: [0,3] D==========eeeeER . . . cmla z0.b, z0.b, z1.b, #90 -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . cmla z0.b, z1.b, z2.b, #90 -# CHECK-NEXT: [1,2] D====================eeeeER . cmla z0.b, z1.b, z2.b, #90 -# CHECK-NEXT: [1,3] D========================eeeeER cmla z0.b, z0.b, z1.b, #90 +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: [1,2] .D===================eeeeER . cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: [1,3] .D=======================eeeeER cmla z0.b, z0.b, z1.b, #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1232,11 +1232,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cmla z0.b, z1.b, z2.b, #90 -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cmla z0.b, z1.b, z2.b, #90 -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cmla z0.b, z0.b, z1.b, #90 -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90 +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [24] Code Region - Z cmla.d @@ -1245,7 +1245,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1803 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.28 # CHECK-NEXT: IPC: 0.22 # CHECK-NEXT: Block RThroughput: 4.0 @@ -1258,10 +1258,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90 # CHECK-NEXT: [0,2] D========eeeeeER . . . . . cmla z0.d, z1.d, z2.d, #90 # CHECK-NEXT: [0,3] D=============eeeeeER . . . . cmla z0.d, z0.d, z1.d, #90 -# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D=======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90 -# CHECK-NEXT: [1,2] D==========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90 -# CHECK-NEXT: [1,3] D===============================eeeeeER cmla z0.d, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] .D=========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] .D==============================eeeeeER cmla z0.d, z0.d, z1.d, #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1270,11 +1270,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 15.0 0.0 0.0 cmla z0.d, z1.d, z2.d, #90 -# CHECK-NEXT: 2. 2 18.0 0.0 0.0 cmla z0.d, z1.d, z2.d, #90 -# CHECK-NEXT: 3. 2 23.0 0.0 0.0 cmla z0.d, z0.d, z1.d, #90 -# CHECK-NEXT: 2 16.5 0.1 0.0 +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 17.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90 +# CHECK-NEXT: 2 16.0 0.1 0.0 # CHECK: [25] Code Region - Z sdot.s @@ -1283,7 +1283,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.42 # CHECK-NEXT: IPC: 0.33 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1296,10 +1296,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b # CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b # CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b -# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b -# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b -# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1308,11 +1308,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b -# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b -# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [26] Code Region - Z sudot @@ -1321,7 +1321,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.42 # CHECK-NEXT: IPC: 0.33 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1334,10 +1334,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1] # CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1] # CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] -# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b[1] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1346,11 +1346,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1] -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [27] Code Region - Z sdot.d @@ -1359,7 +1359,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1372,10 +1372,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h # CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h # CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h -# CHECK-NEXT: [1,2] D====================eeeeER . sdot z0.d, z1.h, z2.h -# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,2] .D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=======================eeeeER sdot z0.d, z0.h, z1.h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1384,11 +1384,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [28] Code Region - Z smmla @@ -1397,7 +1397,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1103 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.36 # CHECK-NEXT: Block RThroughput: 0.8 @@ -1412,8 +1412,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D========eeeER . . . smmla z0.s, z0.b, z1.b # CHECK-NEXT: [1,0] D===========eeeeER . . mul z0.s, z0.s, z0.s # CHECK-NEXT: [1,1] D===============eeeER . smmla z0.s, z1.b, z2.b -# CHECK-NEXT: [1,2] D================eeeER . smmla z0.s, z1.b, z2.b -# CHECK-NEXT: [1,3] D===================eeeER smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,2] .D===============eeeER . smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] .D==================eeeER smmla z0.s, z0.b, z1.b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1424,9 +1424,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.s, z0.s, z0.s # CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla z0.s, z1.b, z2.b -# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla z0.s, z1.b, z2.b -# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla z0.s, z0.b, z1.b -# CHECK-NEXT: 2 10.8 0.1 0.0 +# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 10.5 0.1 0.0 # CHECK: [29] Code Region - Z mla.b @@ -1435,7 +1435,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 4.0 @@ -1448,10 +1448,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b # CHECK-NEXT: [0,2] D======eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b # CHECK-NEXT: [0,3] D==========eeeeER . . . mla z0.b, p0/m, z0.b, z1.b -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b -# CHECK-NEXT: [1,2] D====================eeeeER . mla z0.b, p0/m, z1.b, z2.b -# CHECK-NEXT: [1,3] D========================eeeeER mla z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: [1,2] .D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: [1,3] .D=======================eeeeER mla z0.b, p0/m, z0.b, z1.b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1460,11 +1460,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [30] Code Region - Z mla.d @@ -1473,7 +1473,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1803 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.28 # CHECK-NEXT: IPC: 0.22 # CHECK-NEXT: Block RThroughput: 4.0 @@ -1486,10 +1486,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,2] D========eeeeeER . . . . . mla z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,3] D=============eeeeeER . . . . mla z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D=======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,2] D==========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,3] D===============================eeeeeER mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D==============================eeeeeER mla z0.d, p0/m, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1498,11 +1498,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 15.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 2. 2 18.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 3. 2 23.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: 2 16.5 0.1 0.0 +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 17.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 16.0 0.1 0.0 # CHECK: [31] Code Region - Z smlalb @@ -1511,7 +1511,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1524,10 +1524,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . smlalb z0.d, z1.s, z2.s # CHECK-NEXT: [0,2] D======eeeeER . . . . smlalb z0.d, z1.s, z2.s # CHECK-NEXT: [0,3] D==========eeeeER . . . smlalb z0.d, z0.s, z1.s -# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D===================eeeeER . smlalb z0.d, z1.s, z2.s -# CHECK-NEXT: [1,2] D====================eeeeER . smlalb z0.d, z1.s, z2.s -# CHECK-NEXT: [1,3] D========================eeeeER smlalb z0.d, z0.s, z1.s +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,2] .D===================eeeeER . smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,3] .D=======================eeeeER smlalb z0.d, z0.s, z1.s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1536,11 +1536,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.0 0.0 0.0 smlalb z0.d, z1.s, z2.s -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 smlalb z0.d, z1.s, z2.s -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 smlalb z0.d, z0.s, z1.s -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smlalb z0.d, z0.s, z1.s +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [32] Code Region - Z sqdmlalb @@ -1549,7 +1549,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.33 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1562,10 +1562,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s # CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s # CHECK-NEXT: [0,3] D===========eeeeER . . . . sqdmlalb z0.d, z0.s, z1.s -# CHECK-NEXT: [1,0] D===============eeeeeER . . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s -# CHECK-NEXT: [1,2] D======================eeeeER . . sqdmlalb z0.d, z1.s, z2.s -# CHECK-NEXT: [1,3] D==========================eeeeER sqdmlalb z0.d, z0.s, z1.s +# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===================eeeeER . . sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,3] .D=========================eeeeER sqdmlalb z0.d, z0.s, z1.s # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1574,11 +1574,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s -# CHECK-NEXT: 3. 2 19.5 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s -# CHECK-NEXT: 2 14.3 0.1 0.0 +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s +# CHECK-NEXT: 2 13.8 0.1 0.0 # CHECK: [33] Code Region - Z sqrdmlah.b @@ -1587,7 +1587,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.33 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1600,10 +1600,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b # CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b # CHECK-NEXT: [0,3] D===========eeeeER . . . . sqrdmlah z0.b, z0.b, z1.b -# CHECK-NEXT: [1,0] D===============eeeeeER . . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b -# CHECK-NEXT: [1,2] D======================eeeeER . . sqrdmlah z0.b, z1.b, z2.b -# CHECK-NEXT: [1,3] D==========================eeeeER sqrdmlah z0.b, z0.b, z1.b +# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===================eeeeER . . sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: [1,3] .D=========================eeeeER sqrdmlah z0.b, z0.b, z1.b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1612,11 +1612,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 13.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b -# CHECK-NEXT: 3. 2 19.5 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b -# CHECK-NEXT: 2 14.3 0.1 0.0 +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b +# CHECK-NEXT: 2 13.8 0.1 0.0 # CHECK: [34] Code Region - Z sqrdmlah.d @@ -1625,7 +1625,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1803 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.28 # CHECK-NEXT: IPC: 0.22 # CHECK-NEXT: Block RThroughput: 4.0 @@ -1638,10 +1638,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d # CHECK-NEXT: [0,2] D========eeeeeER . . . . . sqrdmlah z0.d, z1.d, z2.d # CHECK-NEXT: [0,3] D=============eeeeeER . . . . sqrdmlah z0.d, z0.d, z1.d -# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d -# CHECK-NEXT: [1,1] D=======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d -# CHECK-NEXT: [1,2] D==========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d -# CHECK-NEXT: [1,3] D===============================eeeeeER sqrdmlah z0.d, z0.d, z1.d +# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: [1,3] .D==============================eeeeeER sqrdmlah z0.d, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1650,11 +1650,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d -# CHECK-NEXT: 1. 2 15.0 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d -# CHECK-NEXT: 2. 2 18.0 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d -# CHECK-NEXT: 3. 2 23.0 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d -# CHECK-NEXT: 2 16.5 0.1 0.0 +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: 2. 2 17.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d +# CHECK-NEXT: 2 16.0 0.1 0.0 # CHECK: [35] Code Region - Z fcmla ZPmZZ @@ -1663,7 +1663,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1678,8 +1678,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90 # CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 -# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 -# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: [1,2] .D===================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] .D========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1690,9 +1690,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 -# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 -# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 -# CHECK-NEXT: 2 13.0 0.1 0.0 +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [36] Code Region - Z fcmla ZZZI @@ -1701,7 +1701,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1716,8 +1716,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90 # CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 -# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 -# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: [1,2] .D===================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,3] .D========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1728,9 +1728,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 -# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 -# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 -# CHECK-NEXT: 2 13.0 0.1 0.0 +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: 2 12.8 0.1 0.0 # CHECK: [37] Code Region - Z fmla ZPmZZ @@ -1739,7 +1739,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1754,8 +1754,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d # CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, p0/m, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1766,9 +1766,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: 2 11.8 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [38] Code Region - Z fmla ZZZI @@ -1777,7 +1777,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1792,8 +1792,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1] # CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1] -# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1] -# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, z0.d, z1.d[1] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1804,9 +1804,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1] -# CHECK-NEXT: 2 11.8 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [39] Code Region - Z fmlalb ZZZ @@ -1815,7 +1815,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1830,8 +1830,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeeER . . . fmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D================eeeeER . . fmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: [1,2] D==================eeeeER. . fmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: [1,3] D======================eeeeER fmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,2] .D=================eeeeER. . fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=====================eeeeER fmlalb z0.s, z0.h, z1.h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1842,9 +1842,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmlalb z0.s, z0.h, z1.h -# CHECK-NEXT: 2 11.8 0.1 0.0 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.5 0.1 0.0 # CHECK: [40] Code Region - Z bfdot @@ -1853,7 +1853,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1603 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.25 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1868,8 +1868,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfdot z0.s, z0.h, z1.h # CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D===================eeeeeER . . bfdot z0.s, z1.h, z2.h -# CHECK-NEXT: [1,2] D======================eeeeeER. . bfdot z0.s, z1.h, z2.h -# CHECK-NEXT: [1,3] D===========================eeeeeER bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D==========================eeeeeER bfdot z0.s, z0.h, z1.h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1880,9 +1880,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfdot z0.s, z1.h, z2.h -# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfdot z0.s, z1.h, z2.h -# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfdot z0.s, z0.h, z1.h -# CHECK-NEXT: 2 14.0 0.1 0.0 +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: 2 13.8 0.1 0.0 # CHECK: [41] Code Region - Z bfmmla @@ -1891,7 +1891,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1903 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.21 # CHECK-NEXT: IPC: 0.21 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1906,8 +1906,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=============eeeeeeER . . . . bfmmla z0.s, z0.h, z1.h # CHECK-NEXT: [1,0] D===================eeeER. . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D======================eeeeeeER . . bfmmla z0.s, z1.h, z2.h -# CHECK-NEXT: [1,2] D==========================eeeeeeER. . bfmmla z0.s, z1.h, z2.h -# CHECK-NEXT: [1,3] D================================eeeeeeER bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: [1,2] .D=========================eeeeeeER. . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D===============================eeeeeeER bfmmla z0.s, z0.h, z1.h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1918,9 +1918,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 13.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h -# CHECK-NEXT: 2. 2 17.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h -# CHECK-NEXT: 3. 2 23.5 0.0 0.0 bfmmla z0.s, z0.h, z1.h -# CHECK-NEXT: 2 16.3 0.1 0.0 +# CHECK-NEXT: 2. 2 17.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: 2 16.0 0.1 0.0 # CHECK: [42] Code Region - bfmlalb @@ -1929,7 +1929,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 1.0 @@ -1944,8 +1944,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d # CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,2] .D===================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D========================eeeeeER bfmlalb z0.s, z0.h, z1.h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1956,6 +1956,6 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d # CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h -# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h -# CHECK-NEXT: 2 13.0 0.1 0.0 +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 12.8 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s index 6cba45cdd42be..49af4df3d8ff5 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s @@ -5071,19 +5071,19 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 2 2 1.00 movs p0.b, p0/z, p0.b # CHECK-NEXT: 2 2 1.00 movs p15.b, p15.b # CHECK-NEXT: 2 2 1.00 movs p15.b, p15/z, p15.b -# CHECK-NEXT: 1 1 0.06 U mrs x3, ID_AA64ZFR0_EL1 -# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL1 -# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL12 -# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL2 -# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL3 +# CHECK-NEXT: 1 1 0.17 U mrs x3, ID_AA64ZFR0_EL1 +# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL1 +# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL12 +# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL2 +# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL3 # CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b # CHECK-NEXT: 1 5 1.00 msb z0.d, p7/m, z1.d, z31.d # CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s -# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL1, x3 -# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL12, x3 -# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL2, x3 -# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL3, x3 +# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL1, x3 +# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL12, x3 +# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL2, x3 +# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL3, x3 # CHECK-NEXT: 1 4 0.50 mul z0.b, p7/m, z0.b, z31.b # CHECK-NEXT: 1 4 0.50 mul z0.b, z1.b, z2.b # CHECK-NEXT: 2 5 1.00 mul z0.d, p7/m, z0.d, z31.d diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s index 1ef746813966d..c7a93d1b4ed35 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s @@ -733,7 +733,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.97 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 1.7 @@ -745,8 +745,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16 # CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -758,9 +758,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16 # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 1 2.6 0.2 0.0 # CHECK: [1] Code Region - G02 @@ -769,7 +769,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.97 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 1.7 @@ -781,8 +781,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16 # CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -794,9 +794,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16 # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 1 2.6 0.2 0.0 # CHECK: [2] Code Region - G03 @@ -805,7 +805,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.97 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 1.7 @@ -817,8 +817,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28 # CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.8h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -830,9 +830,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28 # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 1 2.6 0.2 0.0 # CHECK: [3] Code Region - G04 @@ -841,7 +841,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.76 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 3.0 @@ -852,9 +852,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -865,10 +865,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [4] Code Region - G05 @@ -877,7 +877,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 3.3 @@ -888,9 +888,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -901,10 +901,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [5] Code Region - G06 @@ -913,7 +913,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 3.3 @@ -924,9 +924,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -937,10 +937,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [6] Code Region - G07 @@ -949,7 +949,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1800 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.54 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 4.3 @@ -960,9 +960,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,3] . D=eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,4] . D=eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -973,10 +973,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 1 1.8 0.2 0.0 # CHECK: [7] Code Region - G08 @@ -985,7 +985,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 5.0 @@ -995,10 +995,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,3] . DeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,4] . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1008,11 +1008,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [8] Code Region - G09 @@ -1021,7 +1021,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1031,10 +1031,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1044,11 +1044,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [9] Code Region - G10 @@ -1057,7 +1057,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 608 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.62 # CHECK-NEXT: IPC: 0.82 # CHECK-NEXT: Block RThroughput: 5.7 @@ -1067,10 +1067,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,4] .D===eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,4] . DeeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1080,11 +1080,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 1 2.6 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [10] Code Region - G11 @@ -1093,7 +1093,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 675 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.70 # CHECK-NEXT: IPC: 0.74 # CHECK-NEXT: Block RThroughput: 6.7 @@ -1103,10 +1103,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,1] .DeeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,2] . DeeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,3] . D=eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,4] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1116,11 +1116,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 1 3.0 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 1 1.4 0.4 0.0 # CHECK: [11] Code Region - G12 @@ -1129,7 +1129,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 675 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.70 # CHECK-NEXT: IPC: 0.74 # CHECK-NEXT: Block RThroughput: 6.7 @@ -1139,10 +1139,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . D=eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,4] . D=eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1152,11 +1152,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 1 1.4 0.4 0.0 # CHECK: [12] Code Region - G13 @@ -1165,7 +1165,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 1210 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.90 # CHECK-NEXT: IPC: 0.41 # CHECK-NEXT: Block RThroughput: 5.7 @@ -1175,10 +1175,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 01 # CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D===eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,4] .D==========eeeeeeeeER ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,1] .DeeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . D=eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,4] . D=======eeeeeeeeER ld1 { v1.b }[0], [x27], #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1188,11 +1188,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 1 4.2 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 1 2.6 0.4 0.0 # CHECK: [13] Code Region - G14 @@ -1201,10 +1201,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.37 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -1212,9 +1212,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1 # CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1225,10 +1225,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1 # CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 1 17.0 0.2 0.0 +# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 1 16.2 0.2 0.0 # CHECK: [14] Code Region - G15 @@ -1237,10 +1237,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.37 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -1248,9 +1248,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28 # CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1261,10 +1261,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28 # CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 1 17.0 0.2 0.0 +# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 1 16.2 0.2 0.0 # CHECK: [15] Code Region - G16 @@ -1273,10 +1273,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.25 # CHECK-NEXT: IPC: 0.42 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -1284,9 +1284,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.2s }, [x27], #4 -# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.4h }, [x27], #2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1297,10 +1297,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [16] Code Region - G17 @@ -1309,10 +1309,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.94 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -1320,9 +1320,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeeeER . ld1r { v1.4s }, [x27], #4 # CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.16b }, [x27], #1 -# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.1d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1333,10 +1333,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [17] Code Region - G18 @@ -1345,10 +1345,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.94 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -1356,9 +1356,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeeeER . ld1r { v1.2d }, [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.8b }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1369,10 +1369,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [18] Code Region - G19 @@ -1381,10 +1381,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.73 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 2.0 +# CHECK-NEXT: Block RThroughput: 3.2 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -1392,9 +1392,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeeeER . ld1r { v1.8h }, [x27], x28 # CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,3] . D=eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1405,10 +1405,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 1 1.8 0.2 0.0 # CHECK: [19] Code Region - G20 @@ -1417,20 +1417,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.71 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1440,11 +1440,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 1 2.6 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [20] Code Region - G21 @@ -1453,20 +1453,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.31 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.7 # CHECK: Timeline view: # CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1476,11 +1476,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 1 2.6 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [21] Code Region - G22 @@ -1489,20 +1489,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 3310 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.63 # CHECK-NEXT: IPC: 0.15 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1512,11 +1512,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 1 16.6 0.2 0.0 +# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 1 15.0 0.2 0.0 # CHECK: [22] Code Region - G23 @@ -1525,20 +1525,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1548,11 +1548,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 1 16.8 0.2 0.0 +# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 1 15.0 0.2 0.0 # CHECK: [23] Code Region - G24 @@ -1561,20 +1561,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 2603 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.77 # CHECK-NEXT: IPC: 0.19 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 012345678 # CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,2] D================eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,3] D=================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,4] .D=================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D==============eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1584,11 +1584,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 3. 1 18.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 4. 1 18.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: 1 12.6 0.2 0.0 +# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 3. 1 15.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: 1 10.8 0.2 0.0 # CHECK: [24] Code Region - G25 @@ -1597,20 +1597,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.92 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1620,11 +1620,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [25] Code Region - G26 @@ -1633,20 +1633,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.92 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1656,11 +1656,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [26] Code Region - G27 @@ -1669,20 +1669,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.51 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 2.8 +# CHECK-NEXT: Block RThroughput: 3.8 # CHECK: Timeline view: # CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1692,33 +1692,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [27] Code Region - G28 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 709 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.27 -# CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.51 +# CHECK-NEXT: IPC: 0.71 +# CHECK-NEXT: Block RThroughput: 5.3 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1728,33 +1728,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 1 2.2 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 1 1.0 0.4 0.0 # CHECK: [28] Code Region - G29 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 809 # CHECK-NEXT: Total uOps: 3300 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.47 -# CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 4.3 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.08 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 5.5 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,1] . DeeeeeeeeER .. ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER.. ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,4] . .DeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1764,33 +1764,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 1 2.2 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.6 0.0 # CHECK: [29] Code Region - G30 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1910 +# CHECK-NEXT: Total Cycles: 1911 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 1.68 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 1.67 # CHECK-NEXT: IPC: 0.26 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK-NEXT: Block RThroughput: 5.3 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 012345678 +# CHECK-NEXT: Index 0123456789 0123456789 -# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,2] .D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,4] . D================eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . D======eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,4] . .D=============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1800,11 +1800,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 10.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 1 6.4 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 4. 1 14.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 1 4.8 0.4 0.0 # CHECK: [30] Code Region - G31 @@ -1813,20 +1813,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.75 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1836,11 +1836,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 1 16.2 0.2 0.0 +# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 1 15.0 0.2 0.0 # CHECK: [31] Code Region - G32 @@ -1849,20 +1849,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 0.75 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1872,33 +1872,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 1 16.2 0.2 0.0 +# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 1 15.0 0.2 0.0 # CHECK: [32] Code Region - G33 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 709 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.27 -# CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.51 +# CHECK-NEXT: IPC: 0.71 +# CHECK-NEXT: Block RThroughput: 5.3 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1908,33 +1908,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: 1 2.2 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: 1 1.0 0.4 0.0 # CHECK: [33] Code Region - G34 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 809 # CHECK-NEXT: Total uOps: 3300 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.47 -# CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 4.3 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.08 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 5.5 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 +# CHECK-NEXT: [0,1] .DeeeeeeeeER .. ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,4] . .DeeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1944,33 +1944,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 1 2.2 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.6 0.0 # CHECK: [34] Code Region - G35 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 709 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.27 -# CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.51 +# CHECK-NEXT: IPC: 0.71 +# CHECK-NEXT: Block RThroughput: 5.3 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1980,33 +1980,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 1 2.2 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.4 0.0 # CHECK: [35] Code Region - G36 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 710 +# CHECK-NEXT: Total Cycles: 1010 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.34 -# CHECK-NEXT: IPC: 0.70 -# CHECK-NEXT: Block RThroughput: 7.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.46 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 7.5 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,3] . D=eeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,4] . D==eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,4] . . DeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2016,33 +2016,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 1 1.6 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [36] Code Region - G37 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 810 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4900 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.05 -# CHECK-NEXT: IPC: 0.62 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.86 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 8.2 # CHECK: Timeline view: -# CHECK-NEXT: 01234567 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,1] .DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,3] . DeeeeeeeeeER. . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,4] . D===eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,2] . DeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,3] . .DeeeeeeeeeER. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2052,33 +2052,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 3.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 1 1.6 0.8 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [37] Code Region - G38 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 809 +# CHECK-NEXT: Total Cycles: 1010 # CHECK-NEXT: Total uOps: 4900 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.06 -# CHECK-NEXT: IPC: 0.62 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.85 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 8.2 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,3] . DeeeeeeeeeER.. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,4] . D=eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,3] . .DeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,4] . . DeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2088,11 +2088,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 1 1.2 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [38] Code Region - G39 @@ -2101,20 +2101,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.00 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,1] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,2] . D============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,3] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: [0,4] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2124,11 +2124,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 1 16.2 0.2 0.0 +# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 2. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 3. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: 4. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 1 13.0 0.2 0.0 # CHECK: [39] Code Region - G40 @@ -2137,20 +2137,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.00 # CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,1] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,2] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,3] . .D==================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: [0,4] . . D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2160,11 +2160,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 1 16.2 0.2 0.0 +# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 2. 1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 3. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: 4. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 1 13.0 0.2 0.0 # CHECK: [40] Code Region - G41 @@ -2173,20 +2173,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 1903 # CHECK-NEXT: Total uOps: 4100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.15 # CHECK-NEXT: IPC: 0.26 -# CHECK-NEXT: Block RThroughput: 5.3 +# CHECK-NEXT: Block RThroughput: 6.8 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 01 # CHECK: [0,0] DeeeeeeeeER . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,1] D========eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,2] .D========eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,3] . D========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: [0,4] . D========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: [0,1] . D======eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,2] . D=====eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,3] . .D====eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: [0,4] . . D===eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2196,33 +2196,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 3. 1 9.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: 1 7.4 0.2 0.0 +# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 3. 1 5.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: 1 4.6 0.2 0.0 # CHECK: [41] Code Region - G42 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 659 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4300 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.53 -# CHECK-NEXT: IPC: 0.76 -# CHECK-NEXT: Block RThroughput: 6.0 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.26 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 7.2 # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: [0,3] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 +# CHECK-NEXT: [0,1] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2232,33 +2232,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: 1 1.2 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [42] Code Region - G43 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 610 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4200 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.89 -# CHECK-NEXT: IPC: 0.82 -# CHECK-NEXT: Block RThroughput: 5.7 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.16 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 7.0 # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,3] . D=eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2268,33 +2268,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 1 1.6 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [43] Code Region - G44 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 508 +# CHECK-NEXT: Total Cycles: 808 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 6.69 -# CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 4.3 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.21 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 5.7 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeeeeeeeER. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,2] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] . D=eeeeeeE-R ldp s1, s2, [x27], #248 -# CHECK-NEXT: [0,4] . D=eeeeeeER ldp d1, d2, [x27], #496 +# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . .DeeeeeeER. ldp s1, s2, [x27], #248 +# CHECK-NEXT: [0,4] . . DeeeeeeER ldp d1, d2, [x27], #496 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2304,11 +2304,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 1.0 ldp s1, s2, [x27], #248 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27], #496 -# CHECK-NEXT: 1 1.4 0.2 0.2 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ldp s1, s2, [x27], #248 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp d1, d2, [x27], #496 +# CHECK-NEXT: 1 1.0 0.8 0.0 # CHECK: [44] Code Region - G45 @@ -2317,20 +2317,20 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.54 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 2.3 +# CHECK-NEXT: Block RThroughput: 3.8 # CHECK: Timeline view: # CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992 -# CHECK-NEXT: [0,1] D=eeeeeeER.. ldp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,2] D==eeeeeeER. ldp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,3] .D==eeeeeeER ldp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,4] .D===eeeeE-R ldp w1, w2, [x27], #248 +# CHECK-NEXT: [0,1] .DeeeeeeER.. ldp s1, s2, [x27, #248]! +# CHECK-NEXT: [0,2] . DeeeeeeER. ldp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,3] . DeeeeeeER ldp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,4] . DeeeeE-R ldp w1, w2, [x27], #248 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2340,11 +2340,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp s1, s2, [x27, #248]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp d1, d2, [x27, #496]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldp q1, q2, [x27, #992]! -# CHECK-NEXT: 4. 1 4.0 0.0 1.0 ldp w1, w2, [x27], #248 -# CHECK-NEXT: 1 2.6 0.2 0.2 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ldp s1, s2, [x27, #248]! +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp d1, d2, [x27, #496]! +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ldp q1, q2, [x27, #992]! +# CHECK-NEXT: 4. 1 1.0 0.0 1.0 ldp w1, w2, [x27], #248 +# CHECK-NEXT: 1 1.0 0.2 0.2 # CHECK: [45] Code Region - G46 @@ -2353,10 +2353,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.14 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Timeline view: # CHECK-NEXT: 01 @@ -2364,9 +2364,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeER .. ldp x1, x2, [x27], #496 # CHECK-NEXT: [0,1] D=eeeeER .. ldp w1, w2, [x27, #248]! -# CHECK-NEXT: [0,2] D==eeeeER .. ldp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,3] D===eeeeeER. ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: [0,4] .D===eeeeeER ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: [0,2] .D=eeeeER .. ldp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,3] . D=eeeeeER. ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: [0,4] . D=eeeeeER ldpsw x1, x2, [x27, #248]! # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2377,10 +2377,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp x1, x2, [x27, #496]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: 1 1.8 0.2 0.0 # CHECK: [46] Code Region - G47 @@ -2389,10 +2389,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -2400,9 +2400,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254 # CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27], #254 -# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27], #254 -# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27], #254 -# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27], #254 +# CHECK-NEXT: [0,2] .D=eeeeeeER . ldr s1, [x27], #254 +# CHECK-NEXT: [0,3] .D==eeeeeeER. ldr d1, [x27], #254 +# CHECK-NEXT: [0,4] . D==eeeeeeER ldr q1, [x27], #254 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2413,10 +2413,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27], #254 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27], #254 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27], #254 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27], #254 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr s1, [x27], #254 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr d1, [x27], #254 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr q1, [x27], #254 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [47] Code Region - G48 @@ -2425,10 +2425,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 0.98 -# CHECK-NEXT: Block RThroughput: 1.7 +# CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -2436,9 +2436,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]! # CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27, #254]! -# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27, #254]! -# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27, #254]! -# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27, #254]! +# CHECK-NEXT: [0,2] .D=eeeeeeER . ldr s1, [x27, #254]! +# CHECK-NEXT: [0,3] .D==eeeeeeER. ldr d1, [x27, #254]! +# CHECK-NEXT: [0,4] . D==eeeeeeER ldr q1, [x27, #254]! # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2449,10 +2449,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27, #254]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27, #254]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27, #254]! -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27, #254]! -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr s1, [x27, #254]! +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr d1, [x27, #254]! +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr q1, [x27, #254]! +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [48] Code Region - G49 @@ -2461,7 +2461,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 1.7 @@ -2473,8 +2473,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeER . ldr w1, [x27], #254 # CHECK-NEXT: [0,1] D=eeeeER . ldr x1, [x27], #254 # CHECK-NEXT: [0,2] D==eeeeER . ldr w1, [x27, #254]! -# CHECK-NEXT: [0,3] D===eeeeER. ldr x1, [x27, #254]! -# CHECK-NEXT: [0,4] D====eeeeER ldrb w1, [x27], #254 +# CHECK-NEXT: [0,3] .D==eeeeER. ldr x1, [x27, #254]! +# CHECK-NEXT: [0,4] .D===eeeeER ldrb w1, [x27], #254 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2486,9 +2486,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr x1, [x27], #254 # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr w1, [x27, #254]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr x1, [x27, #254]! -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrb w1, [x27], #254 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr x1, [x27, #254]! +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrb w1, [x27], #254 +# CHECK-NEXT: 1 2.6 0.2 0.0 # CHECK: [49] Code Region - G50 @@ -2497,7 +2497,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 1.7 @@ -2509,8 +2509,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eeeeER . ldrh w1, [x27], #254 # CHECK-NEXT: [0,2] D==eeeeER . ldrh w1, [x27, #254]! -# CHECK-NEXT: [0,3] D===eeeeER. ldrsb w1, [x27], #254 -# CHECK-NEXT: [0,4] D====eeeeER ldrsb x1, [x27], #254 +# CHECK-NEXT: [0,3] .D==eeeeER. ldrsb w1, [x27], #254 +# CHECK-NEXT: [0,4] .D===eeeeER ldrsb x1, [x27], #254 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2522,9 +2522,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrh w1, [x27], #254 # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsb w1, [x27], #254 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsb x1, [x27], #254 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254 +# CHECK-NEXT: 1 2.6 0.2 0.0 # CHECK: [50] Code Region - G51 @@ -2533,7 +2533,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 1.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 1.7 @@ -2545,8 +2545,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eeeeER . ldrsb x1, [x27, #254]! # CHECK-NEXT: [0,2] D==eeeeER . ldrsh w1, [x27], #254 -# CHECK-NEXT: [0,3] D===eeeeER. ldrsh x1, [x27], #254 -# CHECK-NEXT: [0,4] D====eeeeER ldrsh w1, [x27, #254]! +# CHECK-NEXT: [0,3] .D==eeeeER. ldrsh x1, [x27], #254 +# CHECK-NEXT: [0,4] .D===eeeeER ldrsh w1, [x27, #254]! # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2558,9 +2558,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]! # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsh x1, [x27], #254 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsh w1, [x27, #254]! -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]! +# CHECK-NEXT: 1 2.6 0.2 0.0 # CHECK: [51] Code Region - G52 @@ -2569,10 +2569,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1200 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.38 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 1.0 +# CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 @@ -2580,8 +2580,8 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]! # CHECK-NEXT: [0,1] D=eeeeER. ldrsw x1, [x27], #254 # CHECK-NEXT: [0,2] D==eeeeER ldrsw x1, [x27, #254]! -# CHECK-NEXT: [0,3] D===eeE-R st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,4] D====eeER st1 { v1.2d }, [x27], #16 +# CHECK-NEXT: [0,3] .D==eeE-R st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2d }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2593,9 +2593,9 @@ ldr x2, [x1], #254 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254 # CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]! -# CHECK-NEXT: 3. 1 4.0 0.0 1.0 st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: 1 3.0 0.2 0.2 +# CHECK-NEXT: 3. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16 +# CHECK-NEXT: 1 2.6 0.2 0.2 # CHECK: [52] Code Region - G53 @@ -2604,7 +2604,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 @@ -2614,9 +2614,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8 # CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8b }, [x27], #8 -# CHECK-NEXT: [0,4] D====eeER st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: [0,4] . D==eeER st1 { v1.8h }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2627,10 +2627,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8b }, [x27], #8 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [53] Code Region - G54 @@ -2639,7 +2639,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 @@ -2649,9 +2649,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16 # CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeER . st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeER. st1 { v1.2s }, [x27], x28 -# CHECK-NEXT: [0,4] D====eeER st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: [0,4] . D==eeER st1 { v1.4h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2662,10 +2662,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [54] Code Region - G55 @@ -2674,7 +2674,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 @@ -2684,9 +2684,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28 # CHECK-NEXT: [0,1] D=eeER . st1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeER. st1 { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,4] D====eeER st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: [0,4] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2697,10 +2697,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.16b }, [x27], x28 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [55] Code Region - G56 @@ -2709,7 +2709,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.77 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 3.5 @@ -2718,10 +2718,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,3] D===eeER. st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,4] .D===eeER st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,4] . D=eeER st1 { v1.8b, v2.8b }, [x27], #16 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2731,11 +2731,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 1 1.6 0.2 0.0 # CHECK: [56] Code Region - G57 @@ -2744,7 +2744,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.17 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 4.0 @@ -2753,10 +2753,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,2] D==eeER . st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,2] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeER st1 { v1.2s, v2.2s }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2766,11 +2766,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 1 2.6 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [57] Code Region - G58 @@ -2779,7 +2779,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.17 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 4.0 @@ -2788,10 +2788,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,4] .D===eeER st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeER. st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,4] . DeeER st1 { v1.16b, v2.16b }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2801,11 +2801,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [58] Code Region - G59 @@ -2814,7 +2814,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 2900 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.13 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 6.0 @@ -2823,10 +2823,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,4] . D===eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,2] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,4] . D=eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2836,32 +2836,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 1 2.6 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 1 1.4 0.4 0.0 # CHECK: [59] Code Region - G60 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Total uOps: 3100 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 4.41 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.40 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 6.5 # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,4] . D===eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,2] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,3] . DeeER . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,4] . D=eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2871,11 +2872,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 1 2.6 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 1 1.2 0.6 0.0 # CHECK: [60] Code Region - G61 @@ -2884,7 +2885,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 2900 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.13 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 6.0 @@ -2893,10 +2894,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,3] .D==eeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,4] . D=eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2906,11 +2907,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 1 2.4 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 1 1.2 0.4 0.0 # CHECK: [61] Code Region - G62 @@ -2919,7 +2920,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Total uOps: 3100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.40 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 6.5 @@ -2929,10 +2930,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,3] .D===eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,4] . D====eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,1] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,2] . D=eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,3] . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,4] . D=eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2942,33 +2943,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 1 3.0 0.6 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 1 1.4 0.6 0.0 # CHECK: [62] Code Region - G63 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 804 +# CHECK-NEXT: Total Cycles: 805 # CHECK-NEXT: Total uOps: 3700 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.60 # CHECK-NEXT: IPC: 0.62 # CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,1] D=eeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,2] .D==eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,3] . D==eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,4] . D=====eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK: [0,0] DeeER. . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,1] . DeeER . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,2] . D=eeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,3] . DeeER. . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,4] . . D=eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2978,32 +2979,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 1 3.0 0.8 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 4. 1 2.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 1 1.4 1.0 0.0 # CHECK: [63] Code Region - G64 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Total uOps: 3300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 4.69 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 7.0 # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,2] . D=eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,4] . .DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3013,33 +3015,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 1 2.8 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 1 1.4 0.6 0.0 # CHECK: [64] Code Region - G65 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 706 +# CHECK-NEXT: Total Cycles: 707 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 4.25 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 4.24 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 7.0 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 0123 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,2] .D===eeeeER . st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,3] . D===eeeeER. st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: [0,4] . D====eeeeER st1 { v1.b }[0], [x27], x28 +# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,3] . D=eeeeER. st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: [0,4] . .D=eeeeER st1 { v1.b }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3049,11 +3051,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 1 3.0 0.6 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: 1 1.4 0.8 0.0 # CHECK: [65] Code Region - G66 @@ -3062,7 +3064,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.95 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 5.0 @@ -3072,10 +3074,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: [0,4] .D===eeeeER st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeER . st1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,3] . DeeeeER. st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeER st1 { v1.h }[4], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3085,11 +3087,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [66] Code Region - G67 @@ -3098,7 +3100,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 605 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.80 # CHECK-NEXT: IPC: 0.83 # CHECK-NEXT: Block RThroughput: 6.0 @@ -3108,10 +3110,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeER . st1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,1] .DeeeeER . st1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,3] . DeeeeER. st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.2d, v2.2d }, [x27], #32 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3121,11 +3123,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [67] Code Region - G68 @@ -3134,7 +3136,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 705 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.69 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 7.0 @@ -3144,10 +3146,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeER .. st2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,1] D=eeeeER .. st2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,2] D==eeeeER .. st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,3] .D==eeeeER.. st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,4] .D====eeeeER st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,1] .DeeeeER .. st2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,2] . DeeeeER .. st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,3] . DeeeeER.. st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,4] . D=eeeeER st2 { v1.8h, v2.8h }, [x27], #32 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3157,33 +3159,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 1 2.8 0.4 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 1 1.2 0.4 0.0 # CHECK: [68] Code Region - G69 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 805 +# CHECK-NEXT: Total Cycles: 806 # CHECK-NEXT: Total uOps: 2900 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.60 # CHECK-NEXT: IPC: 0.62 # CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 0123 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,2] .D===eeeeER . st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,1] . DeeeeER . . st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,2] . D=eeeeER . st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . D==eeeeER. st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,4] . D==eeeeER st2 { v1.4s, v2.4s }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3193,33 +3195,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 1 3.6 0.6 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 3.0 1.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 1 2.0 0.8 0.0 # CHECK: [69] Code Region - G70 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 706 +# CHECK-NEXT: Total Cycles: 707 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.68 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 7.0 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 0123 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,2] .D=eeeeER . . st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeER . . st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeER. . st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . D=eeeeER . st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,4] . D==eeeeER st2 { v1.b, v2.b }[8], [x27], #2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3229,11 +3231,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 5.0 2.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 1 3.2 0.6 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 1 1.6 0.8 0.0 # CHECK: [70] Code Region - G71 @@ -3242,7 +3244,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.95 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 5.0 @@ -3252,10 +3254,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeER . st2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeER . st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: [0,3] . DeeeeER. st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.h, v2.h }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3265,11 +3267,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [71] Code Region - G72 @@ -3278,7 +3280,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.95 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 5.0 @@ -3288,10 +3290,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeER . st2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,2] . DeeeeER . st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: [0,3] . DeeeeER. st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.d, v2.d }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3301,11 +3303,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [72] Code Region - G73 @@ -3314,7 +3316,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 807 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.72 # CHECK-NEXT: IPC: 0.62 # CHECK-NEXT: Block RThroughput: 7.0 @@ -3325,9 +3327,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeER . . . st2g x26, [x27], #4064 # CHECK-NEXT: [0,1] D=eER. . . st2g x26, [x27, #4064]! -# CHECK-NEXT: [0,2] D==eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,3] .D==eeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,4] .D======eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,2] .D=eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,3] . DeeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,4] . D==eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3338,32 +3340,32 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2g x26, [x27], #4064 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 1 3.2 0.8 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 1 1.8 0.8 0.0 # CHECK: [73] Code Region - G74 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1405 +# CHECK-NEXT: Total Cycles: 1406 # CHECK-NEXT: Total uOps: 4700 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 3.35 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 3.34 # CHECK-NEXT: IPC: 0.36 # CHECK-NEXT: Block RThroughput: 14.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 0123456789 0 -# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,1] .DeeeeeER . . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,2] . D===eeeeeeER . . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,3] . D===eeeeeeER. . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,4] . D=======eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,2] . D==eeeeeeER. . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,3] . .D=eeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,4] . . D====eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3373,33 +3375,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 2. 1 4.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 4. 1 8.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 1 3.6 1.6 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 2. 1 3.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 4. 1 5.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 1 2.4 1.8 0.0 # CHECK: [74] Code Region - G75 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1206 +# CHECK-NEXT: Total Cycles: 1207 # CHECK-NEXT: Total uOps: 4100 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.40 # CHECK-NEXT: IPC: 0.41 # CHECK-NEXT: Block RThroughput: 12.0 # CHECK: Timeline view: -# CHECK-NEXT: 01234567 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeER . . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,2] .D===eeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,3] . D===eeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,4] . D======eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,2] . DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . .DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,4] . . D==eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3409,33 +3411,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 1 3.6 1.2 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 1 1.4 1.4 0.0 # CHECK: [75] Code Region - G76 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1106 +# CHECK-NEXT: Total Cycles: 1107 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 3.44 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 3.43 # CHECK-NEXT: IPC: 0.45 # CHECK-NEXT: Block RThroughput: 11.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . .. st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeeeeER . .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,2] .D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,3] . D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,4] . D=======eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK: [0,0] DeeeeeeER . . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,2] . D==eeeeeER . . st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,3] . .D=eeeeeER. . st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: [0,4] . . D==eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3445,20 +3447,20 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 2. 1 5.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 4. 1 8.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 1 4.0 1.2 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 2. 1 3.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 1 2.0 1.4 0.0 # CHECK: [76] Code Region - G77 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1005 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.48 # CHECK-NEXT: IPC: 0.50 # CHECK-NEXT: Block RThroughput: 10.0 @@ -3468,10 +3470,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,1] D=eeeeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,2] .D===eeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D====eeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,4] . D======eeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,1] . DeeeeeER. . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,2] . DeeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,3] . .DeeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: [0,4] . . DeeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3481,33 +3483,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 4. 1 7.0 2.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 1 3.8 1.0 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [77] Code Region - G78 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1304 +# CHECK-NEXT: Total Cycles: 1305 # CHECK-NEXT: Total uOps: 4300 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.30 # CHECK-NEXT: IPC: 0.38 # CHECK-NEXT: Block RThroughput: 13.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . .. st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeER . .. st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,2] .D===eeeeeER .. st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,3] . D===eeeeeER .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,4] . D=====eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK: [0,0] DeeeeeER . . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,2] . DeeeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,3] . .DeeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,4] . . DeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3517,33 +3519,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 1 3.4 1.0 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 1 1.0 1.0 0.0 # CHECK: [78] Code Region - G79 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 2399 +# CHECK-NEXT: Total Cycles: 2400 # CHECK-NEXT: Total uOps: 6900 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.88 # CHECK-NEXT: IPC: 0.21 # CHECK-NEXT: Block RThroughput: 24.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123 -# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,1] .DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,2] . D====eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,3] . D=========eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,4] . D========eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,2] . D=eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,3] . . D=====eeeeeeeER . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,4] . . D====eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3553,33 +3555,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 2. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 3. 1 10.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 1 5.2 2.0 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 3. 1 6.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 1 3.0 2.0 0.0 # CHECK: [79] Code Region - G80 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1903 +# CHECK-NEXT: Total Cycles: 1904 # CHECK-NEXT: Total uOps: 5700 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 3.00 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 2.99 # CHECK-NEXT: IPC: 0.26 # CHECK-NEXT: Block RThroughput: 19.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 01 +# CHECK-NEXT: Index 0123456789 012 -# CHECK: [0,0] DeeeeeER . . .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] .DeeeeeeER. . .. st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,2] . D=====eeeeeeER .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,3] . D=====eeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,4] . D=========eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK: [0,0] DeeeeeER . . . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeeER . . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,2] . D===eeeeeeER . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . . D==eeeeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,4] . . D====eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3589,33 +3591,33 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 2. 1 6.0 5.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 6.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 4. 1 10.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 1 4.8 2.0 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 2. 1 4.0 4.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 4. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 1 2.8 2.0 0.0 # CHECK: [80] Code Region - G81 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 500 -# CHECK-NEXT: Total Cycles: 1658 +# CHECK-NEXT: Total Cycles: 1659 # CHECK-NEXT: Total uOps: 4900 -# CHECK: Dispatch Width: 16 -# CHECK-NEXT: uOps Per Cycle: 2.96 +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 0.30 # CHECK-NEXT: Block RThroughput: 16.5 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0123 +# CHECK-NEXT: Index 0123456789 01234 -# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,2] . D=========eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,3] . D===========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,4] . D============eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,2] . .D======eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: [0,3] . . D========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,4] . . D========eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3626,10 +3628,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 # CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 2. 1 10.0 9.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 3. 1 12.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 1 7.4 2.4 0.0 +# CHECK-NEXT: 2. 1 7.0 7.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: 3. 1 9.0 2.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 1 5.4 2.2 0.0 # CHECK: [81] Code Region - G82 @@ -3638,7 +3640,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 757 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.30 # CHECK-NEXT: IPC: 0.66 # CHECK-NEXT: Block RThroughput: 7.5 @@ -3648,10 +3650,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,1] D=eeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: [0,2] D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,3] .D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,4] .D=====eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,1] .DeeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,2] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: [0,3] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,4] . D==eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3661,11 +3663,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 2. 1 4.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 4. 1 6.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 1 3.4 0.6 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 1 1.8 0.6 0.0 # CHECK: [82] Code Region - G83 @@ -3674,7 +3676,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Total uOps: 2700 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.84 # CHECK-NEXT: IPC: 0.71 # CHECK-NEXT: Block RThroughput: 7.0 @@ -3684,10 +3686,10 @@ ldr x2, [x1], #254 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,1] D=eeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: [0,2] .D==eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,3] .D===eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,4] . D===eE--R stg x26, [x27], #4064 +# CHECK-NEXT: [0,1] .DeeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,2] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: [0,3] . DeeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,4] . DeE--R stg x26, [x27], #4064 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3697,11 +3699,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 4. 1 4.0 0.0 2.0 stg x26, [x27], #4064 -# CHECK-NEXT: 1 2.8 0.4 0.4 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 4. 1 1.0 0.0 2.0 stg x26, [x27], #4064 +# CHECK-NEXT: 1 1.2 0.4 0.4 # CHECK: [83] Code Region - G84 @@ -3710,19 +3712,19 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1700 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.37 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 2.8 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeER . . stg x26, [x27, #4064]! # CHECK-NEXT: [0,1] D=eER. . stgp x1, x2, [x27], #992 -# CHECK-NEXT: [0,2] D==eER . stgp x1, x2, [x27, #992]! -# CHECK-NEXT: [0,3] D===eeER. stp s1, s2, [x27], #248 -# CHECK-NEXT: [0,4] .D===eeER stp d1, d2, [x27], #496 +# CHECK-NEXT: [0,2] .D=eER . stgp x1, x2, [x27, #992]! +# CHECK-NEXT: [0,3] . D=eeER. stp s1, s2, [x27], #248 +# CHECK-NEXT: [0,4] . D=eeER stp d1, d2, [x27], #496 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3733,10 +3735,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stg x26, [x27, #4064]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 stgp x1, x2, [x27], #992 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stgp x1, x2, [x27, #992]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp s1, s2, [x27], #248 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 stp d1, d2, [x27], #496 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stgp x1, x2, [x27, #992]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 stp s1, s2, [x27], #248 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp d1, d2, [x27], #496 +# CHECK-NEXT: 1 1.8 0.2 0.0 # CHECK: [84] Code Region - G85 @@ -3745,19 +3747,19 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.84 # CHECK-NEXT: IPC: 0.71 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992 -# CHECK-NEXT: [0,1] D==eeER . stp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,2] D===eeER . stp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,3] .D===eeER. stp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,4] .D=====eER stp w1, w2, [x27], #248 +# CHECK-NEXT: [0,1] .D=eeER . stp s1, s2, [x27, #248]! +# CHECK-NEXT: [0,2] . D=eeER . stp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,3] . D=eeER. stp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,4] . D==eER stp w1, w2, [x27], #248 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3767,11 +3769,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992 -# CHECK-NEXT: 1. 1 3.0 0.0 0.0 stp s1, s2, [x27, #248]! -# CHECK-NEXT: 2. 1 4.0 0.0 0.0 stp d1, d2, [x27, #496]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp q1, q2, [x27, #992]! -# CHECK-NEXT: 4. 1 6.0 0.0 0.0 stp w1, w2, [x27], #248 -# CHECK-NEXT: 1 3.6 0.2 0.0 +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp s1, s2, [x27, #248]! +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp d1, d2, [x27, #496]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 stp q1, q2, [x27, #992]! +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stp w1, w2, [x27], #248 +# CHECK-NEXT: 1 2.0 0.2 0.0 # CHECK: [85] Code Region - G86 @@ -3780,19 +3782,19 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1700 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.37 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 2.8 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeER . . stp x1, x2, [x27], #496 # CHECK-NEXT: [0,1] D=eER. . stp w1, w2, [x27, #248]! -# CHECK-NEXT: [0,2] D==eER . stp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,3] D===eeER. str b1, [x27], #254 -# CHECK-NEXT: [0,4] .D===eeER str h1, [x27], #254 +# CHECK-NEXT: [0,2] .D=eER . stp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,3] . D=eeER. str b1, [x27], #254 +# CHECK-NEXT: [0,4] . D=eeER str h1, [x27], #254 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3803,10 +3805,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stp x1, x2, [x27, #496]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27], #254 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27], #254 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp x1, x2, [x27, #496]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 str b1, [x27], #254 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str h1, [x27], #254 +# CHECK-NEXT: 1 1.8 0.2 0.0 # CHECK: [86] Code Region - G87 @@ -3815,19 +3817,19 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . str s1, [x27], #254 -# CHECK-NEXT: [0,1] D=eeER . str d1, [x27], #254 -# CHECK-NEXT: [0,2] D==eeER . str q1, [x27], #254 -# CHECK-NEXT: [0,3] D===eeER. str b1, [x27, #254]! -# CHECK-NEXT: [0,4] .D===eeER str h1, [x27, #254]! +# CHECK-NEXT: [0,1] .DeeER . str d1, [x27], #254 +# CHECK-NEXT: [0,2] . DeeER . str q1, [x27], #254 +# CHECK-NEXT: [0,3] . DeeER. str b1, [x27, #254]! +# CHECK-NEXT: [0,4] . DeeER str h1, [x27, #254]! # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3837,11 +3839,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27], #254 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27], #254 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27, #254]! -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27, #254]! -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 str d1, [x27], #254 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str q1, [x27], #254 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 str b1, [x27, #254]! +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str h1, [x27, #254]! +# CHECK-NEXT: 1 1.0 0.2 0.0 # CHECK: [87] Code Region - G88 @@ -3850,19 +3852,19 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 503 # CHECK-NEXT: Total uOps: 1800 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.58 # CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: Index 01234567 # CHECK: [0,0] DeeER. . str s1, [x27, #254]! -# CHECK-NEXT: [0,1] D=eeER . str d1, [x27, #254]! -# CHECK-NEXT: [0,2] D==eeER. str q1, [x27, #254]! -# CHECK-NEXT: [0,3] D===eER. str w1, [x27], #254 -# CHECK-NEXT: [0,4] .D===eER str x1, [x27], #254 +# CHECK-NEXT: [0,1] .DeeER . str d1, [x27, #254]! +# CHECK-NEXT: [0,2] . DeeER. str q1, [x27, #254]! +# CHECK-NEXT: [0,3] . DeER. str w1, [x27], #254 +# CHECK-NEXT: [0,4] . D=eER str x1, [x27], #254 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3872,11 +3874,11 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]! -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27, #254]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27, #254]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str w1, [x27], #254 -# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str x1, [x27], #254 -# CHECK-NEXT: 1 2.8 0.2 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 str d1, [x27, #254]! +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str q1, [x27, #254]! +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 str w1, [x27], #254 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str x1, [x27], #254 +# CHECK-NEXT: 1 1.2 0.2 0.0 # CHECK: [88] Code Region - G89 @@ -3885,7 +3887,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 503 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3895,9 +3897,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeER . . str w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eER. . str x1, [x27, #254]! -# CHECK-NEXT: [0,2] D==eER . strb w1, [x27], #254 -# CHECK-NEXT: [0,3] D===eER. strb w1, [x27, #254]! -# CHECK-NEXT: [0,4] D====eER strh w1, [x27], #254 +# CHECK-NEXT: [0,2] .D=eER . strb w1, [x27], #254 +# CHECK-NEXT: [0,3] .D==eER. strb w1, [x27, #254]! +# CHECK-NEXT: [0,4] . D==eER strh w1, [x27], #254 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3908,10 +3910,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 str x1, [x27, #254]! -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 strb w1, [x27], #254 -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 strb w1, [x27, #254]! -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 strh w1, [x27], #254 -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 strb w1, [x27], #254 +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 strb w1, [x27, #254]! +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 strh w1, [x27], #254 +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [89] Code Region - G90 @@ -3920,7 +3922,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 503 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3930,9 +3932,9 @@ ldr x2, [x1], #254 # CHECK: [0,0] DeER . . strh w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eER. . stz2g x26, [x27], #4064 -# CHECK-NEXT: [0,2] D==eER . stz2g x26, [x27, #4064]! -# CHECK-NEXT: [0,3] D===eER. stzg x26, [x27], #4064 -# CHECK-NEXT: [0,4] D====eER stzg x26, [x27, #4064]! +# CHECK-NEXT: [0,2] .D=eER . stz2g x26, [x27, #4064]! +# CHECK-NEXT: [0,3] .D==eER. stzg x26, [x27], #4064 +# CHECK-NEXT: [0,4] . D==eER stzg x26, [x27, #4064]! # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3943,10 +3945,10 @@ ldr x2, [x1], #254 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 stz2g x26, [x27], #4064 -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stz2g x26, [x27, #4064]! -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stzg x26, [x27], #4064 -# CHECK-NEXT: 4. 1 5.0 0.0 0.0 stzg x26, [x27, #4064]! -# CHECK-NEXT: 1 3.0 0.2 0.0 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stz2g x26, [x27, #4064]! +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 stzg x26, [x27], #4064 +# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stzg x26, [x27, #4064]! +# CHECK-NEXT: 1 2.2 0.2 0.0 # CHECK: [90] Code Region - G91 @@ -3955,7 +3957,7 @@ ldr x2, [x1], #254 # CHECK-NEXT: Total Cycles: 110 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 16 +# CHECK: Dispatch Width: 6 # CHECK-NEXT: uOps Per Cycle: 3.64 # CHECK-NEXT: IPC: 1.82 # CHECK-NEXT: Block RThroughput: 0.7 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s index 1690d9669b396..3ddb525327015 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s @@ -23,18 +23,18 @@ mov x1, x2 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 0 0.06 mov x1, #0 -# CHECK-NEXT: 1 0 0.06 mov x1, xzr -# CHECK-NEXT: 1 0 0.06 mov w1, #0 -# CHECK-NEXT: 1 0 0.06 mov w1, wzr -# CHECK-NEXT: 1 0 0.06 fmov h1, wzr -# CHECK-NEXT: 1 0 0.06 fmov h1, xzr -# CHECK-NEXT: 1 0 0.06 fmov s1, wzr -# CHECK-NEXT: 1 0 0.06 fmov d1, xzr -# CHECK-NEXT: 1 0 0.06 movi d1, #0000000000000000 -# CHECK-NEXT: 1 0 0.06 movi v1.2d, #0000000000000000 -# CHECK-NEXT: 1 0 0.06 mov w1, w2 -# CHECK-NEXT: 1 0 0.06 mov x1, x2 +# CHECK-NEXT: 1 0 0.17 mov x1, #0 +# CHECK-NEXT: 1 0 0.17 mov x1, xzr +# CHECK-NEXT: 1 0 0.17 mov w1, #0 +# CHECK-NEXT: 1 0 0.17 mov w1, wzr +# CHECK-NEXT: 1 0 0.17 fmov h1, wzr +# CHECK-NEXT: 1 0 0.17 fmov h1, xzr +# CHECK-NEXT: 1 0 0.17 fmov s1, wzr +# CHECK-NEXT: 1 0 0.17 fmov d1, xzr +# CHECK-NEXT: 1 0 0.17 movi d1, #0000000000000000 +# CHECK-NEXT: 1 0 0.17 movi v1.2d, #0000000000000000 +# CHECK-NEXT: 1 0 0.17 mov w1, w2 +# CHECK-NEXT: 1 0 0.17 mov x1, x2 # CHECK: Resources: # CHECK-NEXT: [0.0] - V2UnitB