diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp index 4f7125864c5a0..f67c43c95935f 100644 --- a/llvm/lib/MC/MCSchedule.cpp +++ b/llvm/lib/MC/MCSchedule.cpp @@ -96,8 +96,9 @@ MCSchedModel::getReciprocalThroughput(const MCSubtargetInfo &STI, for (; I != E; ++I) { if (!I->ReleaseAtCycle) continue; + assert(I->ReleaseAtCycle > I->AcquireAtCycle); unsigned NumUnits = SM.getProcResource(I->ProcResourceIdx)->NumUnits; - double Temp = NumUnits * 1.0 / I->ReleaseAtCycle; + double Temp = NumUnits * 1.0 / (I->ReleaseAtCycle - I->AcquireAtCycle); Throughput = Throughput ? std::min(*Throughput, Temp) : Temp; } if (Throughput) diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index fd049d1a57860..4727e0ca22428 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM RISCVGenRegisterBank.inc -gen-register-bank) tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info) tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM RISCVGenExegesis.inc -gen-exegesis) set(LLVM_TARGET_DEFINITIONS RISCVGISel.td) tablegen(LLVM RISCVGenGlobalISel.inc -gen-global-isel) diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 00c3d702e12a2..4d8320ff5cbb4 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -61,6 +61,12 @@ include "RISCVSchedXiangShanNanHu.td" include "RISCVProcessors.td" +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "RISCVPfmCounters.td" + //===----------------------------------------------------------------------===// // Define the RISC-V target. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp index f72ba2d5c667b..608652a4efafe 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp @@ -198,8 +198,19 @@ char RISCVInsertWriteVXRM::ID = 0; INITIALIZE_PASS(RISCVInsertWriteVXRM, DEBUG_TYPE, RISCV_INSERT_WRITE_VXRM_NAME, false, false) +static unsigned getAndCacheRVVMCOpcode(unsigned VPseudoOpcode) { + // VPseudo opcode -> MC opcode + static DenseMap OpcodeCache; + auto It = OpcodeCache.find(VPseudoOpcode); + if (It != OpcodeCache.end()) + return It->second; + unsigned MCOpcode = RISCV::getRVVMCOpcode(VPseudoOpcode); + OpcodeCache.insert({VPseudoOpcode, MCOpcode}); + return MCOpcode; +} + static bool ignoresVXRM(const MachineInstr &MI) { - switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { + switch (getAndCacheRVVMCOpcode(MI.getOpcode())) { default: return false; case RISCV::VNCLIP_WI: diff --git a/llvm/lib/Target/RISCV/RISCVPfmCounters.td b/llvm/lib/Target/RISCV/RISCVPfmCounters.td new file mode 100644 index 0000000000000..c986a38c30f2d --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPfmCounters.td @@ -0,0 +1,18 @@ +//===---- RISCVPfmCounters.td - RISCV Hardware Counters ----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for RISCV. 
+// +//===----------------------------------------------------------------------===// + +def CpuCyclesPfmCounter : PfmCounter<"CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; +} +def : PfmCountersDefaultBinding; diff --git a/llvm/test/tools/llvm-exegesis/RISCV/deserialize-obj-file.yaml b/llvm/test/tools/llvm-exegesis/RISCV/deserialize-obj-file.yaml new file mode 100644 index 0000000000000..68f394af6bc71 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/deserialize-obj-file.yaml @@ -0,0 +1,29 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -start-before-phase=measure --mode=latency --dry-run-measurement --use-dummy-perf-counters \ +# RUN: --dump-object-to-disk=%t.o %s > %t.result.yml +# RUN: llvm-objdump -d %t.o | FileCheck %s + +# CHECK: vsetvli {{.*}}, zero, e32, m1, tu, ma +# CHECK: fsrmi {{.*}}, 0x0 +# CHECK: vfwredusum.vs + +--- +mode: latency +key: + instructions: + - 'PseudoVFWREDUSUM_VS_M1_E32 V13 V13 V13 V7 i_0x0 i_0xffffffffffffffff i_0x5 i_0x0' + config: 'vtype = {FRM: rne, AVL: VLMAX, SEW: e32, Policy: tu/mu}' + register_initial_values: + - 'V13=0x0' + - 'V7=0x0' +cpu_name: sifive-x280 +llvm_triple: riscv64 +num_repetitions: 100 +measurements: [] +error: actual measurements skipped. 
+info: '' +assembled_snippet: 57730009F3532000D796D3C6D796D3C6D796D3C6D796D3C6739023008280 +object_file: + compression: zlib + original_size: 5632 + compressed_bytes: 'eJztWDFvEzEUfk6btEgMoWVAogMSHSokrJybRrCgIFQQEjAUKiYU3V3s9kQul5zN6egC4hd0YmTuL2FGYuB3oK5IYPt8SXBcIbYO/qTn973Pfs8v5zflw/6zxw2EoAaCc5hHC7heuaa0vmZ9WHef9PDw8PDw8PDw8PDw8PDwuGR4zeHK+ctb8OPz96/eLo/x09vw6ePDFgLIEx4XgH7J11ptN/Oi103IJBikZNIZhIoxMiGDoVpipRWBXE6SmOdEE0bHMU00Z8dB5dJkrFkUVi7SrqC7hM1YaVivO5wxNmNm11Qs5iWLUUDumXojster6S6p2V4wo72uZiVnskLEZI2O/EEqnKZhHE+zqdxWc9o284pODgCVCN282tDaDaN/+cdfUWvq68HP3+7dxpJydIEe6XV1SX+j1+aSfkfaxkKdus8tE9+3b8GClgL2S3pEecKfjln2inIBWE8BDoXIk+idoBxYlgEeZ4LiJy8O73IRxm/lKToKMT0esDxMKWAuchFG0r9Pld8eYqKWALZL3HF/iv/Ec2krDv10s/IjS7efCRlr2QXMgy+9a/vvEDtq6rxrDtFxVs2P7H9yUf6alWDnPzKaPSlnG5XfsfR1K34A1TT1Lb3cnPen+4Bquur8Wj903K3wzdx/ttB3y5H/B0zRwDY=' +... diff --git a/llvm/test/tools/llvm-exegesis/RISCV/lit.local.cfg b/llvm/test/tools/llvm-exegesis/RISCV/lit.local.cfg new file mode 100644 index 0000000000000..e0146cdd32776 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/lit.local.cfg @@ -0,0 +1,4 @@ +if "RISCV" not in config.root.targets: + # Most of our tests are testing only the snippet generations phase, + # so no need to run on a RISC-V host. 
+ config.unsupported = True diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/eligible-inst.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/eligible-inst.test new file mode 100644 index 0000000000000..189adf2c1b334 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/eligible-inst.test @@ -0,0 +1,10 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency \ +# RUN: --opcode-name=PseudoVCOMPRESS_VM_M2_E8,PseudoVCPOP_M_B32 | FileCheck %s --allow-empty --check-prefix=LATENCY +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVCOMPRESS_VM_M2_E8,PseudoVCPOP_M_B32 --min-instructions=100 | FileCheck %s --check-prefix=RTHROUGHPUT + +# LATENCY-NOT: PseudoVCOMPRESS_VM_M2_E8 +# LATENCY-NOT: PseudoVCPOP_M_B32 + +# RTHROUGHPUT: PseudoVCOMPRESS_VM_M2_E8 +# RTHROUGHPUT: PseudoVCPOP_M_B32 diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/explicit-sew.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/explicit-sew.test new file mode 100644 index 0000000000000..476cf35818d6f --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/explicit-sew.test @@ -0,0 +1,7 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVFWREDUSUM_VS_M1_E32 \ +# RUN: --max-configs-per-opcode=1000 --min-instructions=100 | FileCheck %s + +# Make sure none of the config has SEW other than e32 +# CHECK: PseudoVFWREDUSUM_VS_M1_E32 +# CHECK: SEW: e32 +# CHECK-NOT: SEW: e{{(8|16|64)}} diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/filter.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/filter.test new file mode 100644 index 0000000000000..e3a4336fdf670 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/filter.test @@ -0,0 +1,6 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=inverse_throughput 
--opcode-name=PseudoVNCLIPU_WX_M1_MASK \ +# RUN: --riscv-filter-config='vtype = {VXRM: rod, AVL: VLMAX, SEW: e(8|16), Policy: ta/mu}' --max-configs-per-opcode=1000 --min-instructions=100 | FileCheck %s + +# CHECK: config: 'vtype = {VXRM: rod, AVL: VLMAX, SEW: e8, Policy: ta/mu}' +# CHECK: config: 'vtype = {VXRM: rod, AVL: VLMAX, SEW: e16, Policy: ta/mu}' +# CHECK-NOT: config: 'vtype = {VXRM: rod, AVL: VLMAX, SEW: e(32|64), Policy: ta/mu}' diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/reduction.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/reduction.test new file mode 100644 index 0000000000000..a637fa24af16b --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/reduction.test @@ -0,0 +1,7 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVWREDSUMU_VS_M8_E32 --min-instructions=100 | \ +# RUN: FileCheck %s + +# Make sure reduction ops don't have alias between vd and vs1 +# CHECK: instructions: +# CHECK-NEXT: PseudoVWREDSUMU_VS_M8_E32 +# CHECK-NOT: V[[REG:[0-9]+]] V[[REG]] V{{[0-9]+}}M8 V[[REG]] diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/self-aliasing.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/self-aliasing.test new file mode 100644 index 0000000000000..c950341716238 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/self-aliasing.test @@ -0,0 +1,6 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVXOR_VX_M4 --min-instructions=100 | \ +# RUN: FileCheck %s + +# Make sure all def / use operands are the same in latency mode. 
+# CHECK: instructions: +# CHECK-NEXT: PseudoVXOR_VX_M4 V[[REG:[0-9]+]]M4 V[[REG]]M4 V[[REG]]M4 X{{.*}} diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/skip-rm.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/skip-rm.test new file mode 100644 index 0000000000000..a3af37149eeb5 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/skip-rm.test @@ -0,0 +1,12 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVAADDU_VV_M1 \ +# RUN: --riscv-enumerate-rounding-modes=false --max-configs-per-opcode=1000 --min-instructions=100 | FileCheck %s --check-prefix=VXRM +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVFADD_VFPR16_M1_E16 \ +# RUN: --riscv-enumerate-rounding-modes=false --max-configs-per-opcode=1000 --min-instructions=100 | FileCheck %s --check-prefix=FRM + +# VXRM: PseudoVAADDU_VV_M1 +# VXRM: VXRM: rnu +# VXRM-NOT: VXRM: {{(rne|rdn|rod)}} + +# FRM: PseudoVFADD_VFPR16_M1_E16 +# FRM: FRM: rne +# FRM-NOT: FRM: {{(rtz|rdn|rup|rmm|dyn)}} diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/valid-sew-zvk.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/valid-sew-zvk.test new file mode 100644 index 0000000000000..3d1bb299c0a5f --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/valid-sew-zvk.test @@ -0,0 +1,30 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVAESDF_VS_M1_M1 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=ZVK +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVGHSH_VV_M1 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=ZVK +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 
-benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVSM4K_VI_M1 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=ZVK +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVSM3C_VI_M2 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=ZVK +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVSHA2MS_VV_M1 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --allow-empty --check-prefix=ZVKNH +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p670 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVSM3C_VI_M1 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --allow-empty --check-prefix=EMPTY + +# Most vector crypto only supports SEW=32, except Zvknhb which also supports SEW=64 +# ZVK-NOT: SEW: e{{(8|16)}} +# ZVK: SEW: e32 +# ZVK-NOT: SEW: e64 + +# ZVKNH(A|B) can either have SEW=32 (EGW=128) or SEW=64 (EGW=256) + +# ZVKNH-NOT: SEW: e{{(8|16)}} +# ZVKNH: SEW: e{{(32|64)}} + +# EMPTY-NOT: SEW: e{{(8|16|32|64)}} diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/valid-sew.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/valid-sew.test new file mode 100644 index 0000000000000..b678300564529 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/valid-sew.test @@ -0,0 +1,41 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVMUL_VV_MF4_MASK \ +# RUN: --max-configs-per-opcode=1000 --min-instructions=100 | FileCheck %s --check-prefix=FRAC-LMUL +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency \ +# RUN: 
--opcode-name=PseudoVFADD_VFPR16_M1_E16,PseudoVFADD_VV_M2_E16,PseudoVFCLASS_V_MF2 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=FP +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=inverse_throughput \ +# RUN: --opcode-name=PseudoVSEXT_VF8_M2,PseudoVZEXT_VF8_M2 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=VEXT +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-p470 -benchmark-phase=assemble-measured-code --mode=latency \ +# RUN: --opcode-name=PseudoVFREDUSUM_VS_M1_E16 --max-configs-per-opcode=1000 --min-instructions=100 | \ +# RUN: FileCheck %s --check-prefix=VFRED --allow-empty + +# Make sure only the supported SEWs are generated for fractional LMUL. +# FRAC-LMUL: PseudoVMUL_VV_MF4_MASK +# FRAC-LMUL: SEW: e8 +# FRAC-LMUL: SEW: e16 +# FRAC-LMUL-NOT: SEW: e{{(32|64)}} + +# Make sure only SEWs that are equal to the supported FLEN are generated +# FP: PseudoVFADD_VFPR16_M1_E16 +# FP-NOT: SEW: e8 +# FP: PseudoVFADD_VV_M2_E16 +# FP-NOT: SEW: e8 +# FP: PseudoVFCLASS_V_MF2 +# FP-NOT: SEW: e8 + +# VS/ZEXT can only operate on SEW that will not lead to invalid EEW on the +# source operand. 
+# VEXT: PseudoVSEXT_VF8_M2 +# VEXT-NOT: SEW: e8 +# VEXT-NOT: SEW: e16 +# VEXT-NOT: SEW: e32 +# VEXT: SEW: e64 +# VEXT: PseudoVZEXT_VF8_M2 +# VEXT-NOT: SEW: e8 +# VEXT-NOT: SEW: e16 +# VEXT-NOT: SEW: e32 +# VEXT: SEW: e64 + +# P470 doesn't have Zvfh so 16-bit vfredusum shouldn't exist +# VFRED-NOT: PseudoVFREDUSUM_VS_M1_E16 diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/vlmax-only.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/vlmax-only.test new file mode 100644 index 0000000000000..30897b6e13735 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/vlmax-only.test @@ -0,0 +1,7 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVFWREDUSUM_VS_M1_E32 \ +# RUN: --riscv-vlmax-for-vl --max-configs-per-opcode=1000 --min-instructions=100 | FileCheck %s + +# Only allow VLMAX for AVL when -riscv-vlmax-for-vl is present +# CHECK: PseudoVFWREDUSUM_VS_M1_E32 +# CHECK: AVL: VLMAX +# CHECK-NOT: AVL: {{(simm5|)}} diff --git a/llvm/test/tools/llvm-exegesis/RISCV/rvv/vtype-rm-setup.test b/llvm/test/tools/llvm-exegesis/RISCV/rvv/vtype-rm-setup.test new file mode 100644 index 0000000000000..c41b357c13821 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/rvv/vtype-rm-setup.test @@ -0,0 +1,13 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVFWREDUSUM_VS_M1_E32 \ +# RUN: --max-configs-per-opcode=1 --min-instructions=100 --dump-object-to-disk=%t.o > %t.txt +# RUN: llvm-objdump --triple=riscv64 -d %t.o | FileCheck %s --check-prefix=VFWREDUSUM +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVSSRL_VX_MF4 \ +# RUN: --max-configs-per-opcode=1 --min-instructions=100 --dump-object-to-disk=%t.o > %t.txt +# RUN: llvm-objdump --triple=riscv64 -d %t.o | FileCheck %s --check-prefix=VSSRL + +# Make sure the correct VSETVL / VXRM 
write / FRM write instructions are generated +# VFWREDUSUM: vsetvli {{.*}}, zero, e32, m1, tu, ma +# VFWREDUSUM: fsrmi {{.*}}, 0x0 + +# VSSRL: vsetvli {{.*}}, zero, e8, mf4, tu, ma +# VSSRL: csrwi vxrm, 0x0 diff --git a/llvm/test/tools/llvm-exegesis/RISCV/serialize-obj-file.test b/llvm/test/tools/llvm-exegesis/RISCV/serialize-obj-file.test new file mode 100644 index 0000000000000..6c0650ea07046 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/serialize-obj-file.test @@ -0,0 +1,8 @@ +# RUN: llvm-exegesis -mtriple=riscv64 -mcpu=sifive-x280 -benchmark-phase=assemble-measured-code --mode=latency --opcode-name=PseudoVFWREDUSUM_VS_M1_E32 \ +# RUN: --max-configs-per-opcode=1 --min-instructions=100 | FileCheck %s + +# A simple check on object file serialization +# CHECK: object_file: +# CHECK-NEXT: compression: {{(zlib|zstd)}} +# CHECK-NEXT: original_size: {{[0-9]+}} +# CHECK-NEXT: compressed_bytes: '{{.*}}' diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-noise.test b/llvm/test/tools/llvm-exegesis/X86/analysis-noise.test index 6f4ecfcc0ad6d..918efaa9153da 100644 --- a/llvm/test/tools/llvm-exegesis/X86/analysis-noise.test +++ b/llvm/test/tools/llvm-exegesis/X86/analysis-noise.test @@ -1,4 +1,5 @@ # RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clusters-output-file="" -analysis-numpoints=3 | FileCheck %s +# XFAIL: * # CHECK: DOCTYPE # CHECK: [noise] Cluster (1 points) diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp index be10c32cf08d5..811987c06d4b6 100644 --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -11,143 +11,41 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include +#include "llvm/Support/Regex.h" +#include #include namespace llvm { -namespace exegesis { - 
-static const char kCsvSep = ','; - -namespace { - -enum EscapeTag { kEscapeCsv, kEscapeHtml, kEscapeHtmlString }; - -template void writeEscaped(raw_ostream &OS, const StringRef S); - -template <> void writeEscaped(raw_ostream &OS, const StringRef S) { - if (!S.contains(kCsvSep)) { - OS << S; - } else { - // Needs escaping. - OS << '"'; - for (const char C : S) { - if (C == '"') - OS << "\"\""; - else - OS << C; - } - OS << '"'; - } -} - -template <> void writeEscaped(raw_ostream &OS, const StringRef S) { - for (const char C : S) { - if (C == '<') - OS << "<"; - else if (C == '>') - OS << ">"; - else if (C == '&') - OS << "&"; - else - OS << C; - } -} - -template <> -void writeEscaped(raw_ostream &OS, const StringRef S) { - for (const char C : S) { - if (C == '"') - OS << "\\\""; - else - OS << C; - } -} - -} // namespace - -template -static void -writeClusterId(raw_ostream &OS, - const BenchmarkClustering::ClusterId &CID) { - if (CID.isNoise()) - writeEscaped(OS, "[noise]"); - else if (CID.isError()) - writeEscaped(OS, "[error]"); - else - OS << CID.getId(); -} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +static cl::opt + SchedClassAnalysisBlackList("sched-class-analysis-blacklist", + cl::desc("Regex of sched class to exclude from" + " analysis"), + cl::Hidden, cl::init("")); +#endif -template -static void writeMeasurementValue(raw_ostream &OS, const double Value) { - // Given Value, if we wanted to serialize it to a string, - // how many base-10 digits will we need to store, max? - static constexpr auto MaxDigitCount = - std::numeric_limits::max_digits10; - // Also, we will need a decimal separator. - static constexpr auto DecimalSeparatorLen = 1; // '.' e.g. - // So how long of a string will the serialization produce, max? - static constexpr auto SerializationLen = MaxDigitCount + DecimalSeparatorLen; - - // WARNING: when changing the format, also adjust the small-size estimate ^. 
- static constexpr StringLiteral SimpleFloatFormat = StringLiteral("{0:F}"); - - writeEscaped( - OS, formatv(SimpleFloatFormat.data(), Value).sstr()); -} +namespace exegesis { -template -void Analysis::writeSnippet(raw_ostream &OS, ArrayRef Bytes, +void Analysis::printSnippet(raw_ostream &OS, ArrayRef Bytes, const char *Separator) const { - SmallVector Lines; + ListSeparator LS(Separator); + std::string Line; + raw_string_ostream LineSS(Line); // Parse the asm snippet and print it. while (!Bytes.empty()) { MCInst MI; uint64_t MISize = 0; if (!DisasmHelper_->decodeInst(MI, MISize, Bytes)) { - writeEscaped(OS, join(Lines, Separator)); - writeEscaped(OS, Separator); - writeEscaped(OS, "[error decoding asm snippet]"); + OS << LS << "[error decoding asm snippet]"; return; } - SmallString<128> InstPrinterStr; // FIXME: magic number. - raw_svector_ostream OSS(InstPrinterStr); - DisasmHelper_->printInst(&MI, OSS); + Line.clear(); + DisasmHelper_->printInst(&MI, LineSS); + OS << LS << StringRef(Line).trim(); Bytes = Bytes.drop_front(MISize); - Lines.emplace_back(InstPrinterStr.str().trim()); } - writeEscaped(OS, join(Lines, Separator)); -} - -// Prints a row representing an instruction, along with scheduling info and -// point coordinates (measurements). 
-void Analysis::printInstructionRowCsv(const size_t PointId, - raw_ostream &OS) const { - const Benchmark &Point = Clustering_.getPoints()[PointId]; - writeClusterId(OS, Clustering_.getClusterIdForPoint(PointId)); - OS << kCsvSep; - writeSnippet(OS, Point.AssembledSnippet, "; "); - OS << kCsvSep; - writeEscaped(OS, Point.Key.Config); - OS << kCsvSep; - assert(!Point.Key.Instructions.empty()); - const MCInst &MCI = Point.keyInstruction(); - unsigned SchedClassId; - std::tie(SchedClassId, std::ignore) = ResolvedSchedClass::resolveSchedClassId( - State_.getSubtargetInfo(), State_.getInstrInfo(), MCI); -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - const MCSchedClassDesc *const SCDesc = - State_.getSubtargetInfo().getSchedModel().getSchedClassDesc(SchedClassId); - writeEscaped(OS, SCDesc->Name); -#else - OS << SchedClassId; -#endif - for (const auto &Measurement : Point.Measurements) { - OS << kCsvSep; - writeMeasurementValue(OS, Measurement.PerInstructionValue); - } - OS << "\n"; } Analysis::Analysis(const LLVMState &State, @@ -165,26 +63,67 @@ Analysis::Analysis(const LLVMState &State, } template <> -Error Analysis::run(raw_ostream &OS) const { - if (Clustering_.getPoints().empty()) - return Error::success(); +Expected +Analysis::exportResult() const { + typename Analysis::PrintClusters::Result Clusters; - // Write the header. - OS << "cluster_id" << kCsvSep << "opcode_name" << kCsvSep << "config" - << kCsvSep << "sched_class"; - for (const auto &Measurement : Clustering_.getPoints().front().Measurements) { - OS << kCsvSep; - writeEscaped(OS, Measurement.Key); - } - OS << "\n"; + for (const auto &Measurement : Clustering_.getPoints().front().Measurements) + Clusters.MeasurementNames.push_back(Measurement.Key); - // Write the points. 
- for (const auto &ClusterIt : Clustering_.getValidClusters()) { + auto &Entries = Clusters.Data; + for (const auto &ClusterIt : Clustering_.getValidClusters()) for (const size_t PointId : ClusterIt.PointIndices) { - printInstructionRowCsv(PointId, OS); + Entries.emplace_back(); + auto &Data = Entries.back(); + const Benchmark &Point = Clustering_.getPoints()[PointId]; + Data.Id = Clustering_.getClusterIdForPoint(PointId); + raw_string_ostream SS(Data.Snippet); + printSnippet(SS, Point.AssembledSnippet, /*Separator=*/"; "); + Data.Config = Point.Key.Config; + + assert(!Point.Key.Instructions.empty()); + const MCInst &MCI = Point.keyInstruction(); + unsigned SchedClassId; + std::tie(SchedClassId, std::ignore) = + ResolvedSchedClass::resolveSchedClassId(State_.getSubtargetInfo(), + State_.getInstrInfo(), MCI); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + const MCSchedClassDesc *const SCDesc = + State_.getSubtargetInfo().getSchedModel().getSchedClassDesc( + SchedClassId); + Data.SchedClass = SCDesc->Name; +#else + Data.SchedClass = SchedClassId; +#endif + + for (const auto &Measurement : Point.Measurements) + Data.Measurements.push_back(Measurement.PerInstructionValue); } - OS << "\n\n"; + + return Clusters; +} + +template <> +Error Analysis::run( + raw_ostream &OS, Analysis::OutputFormat Format) const { + if (Clustering_.getPoints().empty()) + return Error::success(); + + auto Result = exportResult(); + if (!Result) + return Result.takeError(); + + switch (Format) { + case OF_Default: + AnalysisResult::printCSV(OS, *Result); + break; + case OF_YAML: + AnalysisResult::printYAML(OS, *Result); + break; + default: + llvm_unreachable("Unsupported output format"); } + return Error::success(); } @@ -227,95 +166,6 @@ Analysis::makePointsPerSchedClass() const { return Entries; } -// Parallel benchmarks repeat the same opcode multiple times. Just show this -// opcode and show the whole snippet only on hover. 
-static void writeParallelSnippetHtml(raw_ostream &OS, - const std::vector &Instructions, - const MCInstrInfo &InstrInfo) { - if (Instructions.empty()) - return; - writeEscaped(OS, InstrInfo.getName(Instructions[0].getOpcode())); - if (Instructions.size() > 1) - OS << " (x" << Instructions.size() << ")"; -} - -// Latency tries to find a serial path. Just show the opcode path and show the -// whole snippet only on hover. -static void writeLatencySnippetHtml(raw_ostream &OS, - const std::vector &Instructions, - const MCInstrInfo &InstrInfo) { - bool First = true; - for (const MCInst &Instr : Instructions) { - if (First) - First = false; - else - OS << " → "; - writeEscaped(OS, InstrInfo.getName(Instr.getOpcode())); - } -} - -void Analysis::printPointHtml(const Benchmark &Point, raw_ostream &OS) const { - OS << "
  • (OS, Point.AssembledSnippet, "\n"); - OS << "\">"; - switch (Point.Mode) { - case Benchmark::Latency: - writeLatencySnippetHtml(OS, Point.Key.Instructions, State_.getInstrInfo()); - break; - case Benchmark::Uops: - case Benchmark::InverseThroughput: - writeParallelSnippetHtml(OS, Point.Key.Instructions, State_.getInstrInfo()); - break; - default: - llvm_unreachable("invalid mode"); - } - OS << " "; - writeEscaped(OS, Point.Key.Config); - OS << "
  • "; -} - -void Analysis::printSchedClassClustersHtml( - const std::vector &Clusters, - const ResolvedSchedClass &RSC, raw_ostream &OS) const { - const auto &Points = Clustering_.getPoints(); - OS << ""; - OS << ""; - assert(!Clusters.empty()); - for (const auto &Measurement : - Points[Clusters[0].getPointIds()[0]].Measurements) { - OS << ""; - } - OS << ""; - for (const SchedClassCluster &Cluster : Clusters) { - OS << ""; - for (const auto &Stats : Cluster.getCentroid().getStats()) { - OS << ""; - } - OS << ""; - } - OS << "
    ClusterIdOpcode/Config"; - writeEscaped(OS, Measurement.Key); - OS << "
    "; - writeClusterId(OS, Cluster.id()); - OS << "
      "; - for (const size_t PointId : Cluster.getPointIds()) { - printPointHtml(Points[PointId], OS); - } - OS << "
    "; - writeMeasurementValue(OS, Stats.avg()); - OS << "
    ["; - writeMeasurementValue(OS, Stats.min()); - OS << ";"; - writeMeasurementValue(OS, Stats.max()); - OS << "]
    "; -} - void Analysis::SchedClassCluster::addPoint( size_t PointId, const BenchmarkClustering &Clustering) { PointIds.push_back(PointId); @@ -352,196 +202,50 @@ bool Analysis::SchedClassCluster::measurementsMatch( AnalysisInconsistencyEpsilonSquared_); } -void Analysis::printSchedClassDescHtml(const ResolvedSchedClass &RSC, - raw_ostream &OS) const { - OS << ""; - OS << ""; - if (RSC.SCDesc->isValid()) { - const auto &SI = State_.getSubtargetInfo(); - const auto &SM = SI.getSchedModel(); - OS << ""; - OS << ""; - OS << ""; - // Latencies. - OS << ""; - // inverse throughput. - OS << ""; - // WriteProcRes. - OS << ""; - // Idealized port pressure. - OS << ""; - OS << ""; - } else { - OS << ""; - } - OS << "
    ValidVariantNumMicroOpsLatencyRThroughputWriteProcResIdealized Resource Pressure
    " << (RSC.WasVariant ? "✔" : "✕") << "" << RSC.SCDesc->NumMicroOps << "
      "; - for (int I = 0, E = RSC.SCDesc->NumWriteLatencyEntries; I < E; ++I) { - const auto *const Entry = SI.getWriteLatencyEntry(RSC.SCDesc, I); - OS << "
    • " << Entry->Cycles; - if (RSC.SCDesc->NumWriteLatencyEntries > 1) { - // Dismabiguate if more than 1 latency. - OS << " (WriteResourceID " << Entry->WriteResourceID << ")"; - } - OS << "
    • "; - } - OS << "
    "; - writeMeasurementValue( - OS, MCSchedModel::getReciprocalThroughput(SI, *RSC.SCDesc)); - OS << "
      "; - for (const auto &WPR : RSC.NonRedundantWriteProcRes) { - OS << "
    • "; - writeEscaped(OS, - SM.getProcResource(WPR.ProcResourceIdx)->Name); - OS << ": " << WPR.ReleaseAtCycle << "
    • "; - } - OS << "
      "; - for (const auto &Pressure : RSC.IdealizedProcResPressure) { - OS << "
    • "; - writeEscaped( - OS, SI.getSchedModel().getProcResource(Pressure.first)->Name); - OS << ": "; - writeMeasurementValue(OS, Pressure.second); - OS << "
    • "; - } - OS << "
    "; -} - -void Analysis::printClusterRawHtml(const BenchmarkClustering::ClusterId &Id, - StringRef display_name, - raw_ostream &OS) const { - const auto &Points = Clustering_.getPoints(); - const auto &Cluster = Clustering_.getCluster(Id); - if (Cluster.PointIndices.empty()) - return; - - OS << "

    " << display_name << " Cluster (" - << Cluster.PointIndices.size() << " points)

    "; - OS << ""; - // Table Header. - OS << ""; - for (const auto &Measurement : Points[Cluster.PointIndices[0]].Measurements) { - OS << ""; - } - OS << ""; - - // Point data. - for (const auto &PointId : Cluster.PointIndices) { - OS << ""; - for (const auto &Measurement : Points[PointId].Measurements) { - OS << ""; - } - OS << "
    ClusterIdOpcode/Config"; - writeEscaped(OS, Measurement.Key); - OS << "
    " << display_name << "
      "; - printPointHtml(Points[PointId], OS); - OS << "
    "; - writeMeasurementValue(OS, Measurement.PerInstructionValue); - } - OS << "
    "; - - OS << "
    "; - -} // namespace exegesis - -static constexpr const char kHtmlHead[] = R"( - -llvm-exegesis Analysis Results - - -)"; template <> -Error Analysis::run( - raw_ostream &OS) const { - const auto &FirstPoint = Clustering_.getPoints()[0]; - // Print the header. - OS << "" << kHtmlHead << ""; - OS << "

    llvm-exegesis Analysis Results

    "; - OS << "

    Triple: "; - writeEscaped(OS, FirstPoint.LLVMTriple); - OS << "

    Cpu: "; - writeEscaped(OS, FirstPoint.CpuName); - OS << "

    "; - OS << "

    Epsilon: " - << format("%0.2f", std::sqrt(AnalysisInconsistencyEpsilonSquared_)) - << "

    "; +Expected +Analysis::exportResult() const { + AnalysisResult::SchedClassInconsistencies Result; + const MCInstrInfo &II = State_.getInstrInfo(); const auto &SI = State_.getSubtargetInfo(); + const auto &SM = SI.getSchedModel(); + + const auto &Points = Clustering_.getPoints(); + const auto &FirstPoint = Points[0]; + Result.Triple = FirstPoint.LLVMTriple; + Result.CPUName = FirstPoint.CpuName; + Result.Epsilon = std::sqrt(AnalysisInconsistencyEpsilonSquared_); + + std::vector SchedClassClusters; for (const auto &RSCAndPoints : makePointsPerSchedClass()) { - if (!RSCAndPoints.RSC.SCDesc) + const auto &RSC = RSCAndPoints.RSC; + if (!RSC.SCDesc) continue; + + if (!filterMCSchedClass(*RSC.SCDesc)) + continue; + // Bucket sched class points into sched class clusters. - std::vector SchedClassClusters; + SchedClassClusters.clear(); for (const size_t PointId : RSCAndPoints.PointIds) { const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId); if (!ClusterId.isValid()) continue; // Ignore noise and errors. FIXME: take noise into account ? if (ClusterId.isUnstable() ^ AnalysisDisplayUnstableOpcodes_) continue; // Either display stable or unstable clusters only. - auto SchedClassClusterIt = - find_if(SchedClassClusters, [ClusterId](const SchedClassCluster &C) { + auto SchedClassClusterIt = llvm::find_if( + SchedClassClusters, [ClusterId](const SchedClassCluster &C) { return C.id() == ClusterId; }); if (SchedClassClusterIt == SchedClassClusters.end()) { @@ -553,32 +257,111 @@ Error Analysis::run( // Print any scheduling class that has at least one cluster that does not // match the checked-in data. 
- if (all_of(SchedClassClusters, [this, &RSCAndPoints, - &SI](const SchedClassCluster &C) { - return C.measurementsMatch(SI, RSCAndPoints.RSC, Clustering_, - AnalysisInconsistencyEpsilonSquared_); - })) + if (all_of( + SchedClassClusters, [this, &RSC, &SI](const SchedClassCluster &C) { + return C.measurementsMatch(SI, RSC, Clustering_, + AnalysisInconsistencyEpsilonSquared_); + })) continue; // Nothing weird. - OS << "

    Sched Class "; + Result.Inconsistencies.emplace_back(); + auto &ResultEntry = Result.Inconsistencies.back(); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - writeEscaped(OS, RSCAndPoints.RSC.SCDesc->Name); + ResultEntry.Name = RSC.SCDesc->Name; #else - OS << RSCAndPoints.RSC.SchedClassId; + ResultEntry.Name = RSC.SchedClassId; #endif - OS << " contains instructions whose performance characteristics do" - " not match that of LLVM:

    "; - printSchedClassClustersHtml(SchedClassClusters, RSCAndPoints.RSC, OS); - OS << "

    llvm SchedModel data:

    "; - printSchedClassDescHtml(RSCAndPoints.RSC, OS); - OS << "
    "; + + assert(!SchedClassClusters.empty()); + for (const auto &Measurement : + Points[SchedClassClusters[0].getPointIds()[0]].Measurements) + ResultEntry.MeasurementNames.push_back(Measurement.Key); + + // Measurements + for (const SchedClassCluster &Cluster : SchedClassClusters) { + ResultEntry.Measurements.emplace_back(); + auto &Measurement = ResultEntry.Measurements.back(); + Measurement.ClusterId = Cluster.id(); + Measurement.IsInconsistent = !Cluster.measurementsMatch( + SI, RSC, Clustering_, AnalysisInconsistencyEpsilonSquared_); + + // Description of points in this cluster. + for (const size_t PointId : Cluster.getPointIds()) { + Measurement.Points.emplace_back(); + auto &ResPoint = Measurement.Points.back(); + const auto &Point = Points[PointId]; + if (!Point.Key.Instructions.empty()) + ResPoint.Opcode = II.getName(Point.Key.Instructions[0].getOpcode()); + ResPoint.Config = Point.Key.Config; + raw_string_ostream SS(ResPoint.Snippet); + printSnippet(SS, Point.AssembledSnippet); + } + + // Measured data. + for (const auto &Stats : Cluster.getCentroid().getStats()) { + Measurement.Data.emplace_back(); + Measurement.Data.back() = {Stats.min(), Stats.avg(), Stats.max()}; + } + } + + // SchedModel data + ResultEntry.IsVariant = RSC.WasVariant; + ResultEntry.NumMicroOps = RSC.SCDesc->NumMicroOps; + // Latencies. + for (int I = 0, E = RSC.SCDesc->NumWriteLatencyEntries; I < E; ++I) { + const auto *const Entry = SI.getWriteLatencyEntry(RSC.SCDesc, I); + ResultEntry.Latency.emplace_back( + std::make_pair(Entry->WriteResourceID, + RSC.computeNormalizedWriteLatency(Entry, SI))); + } + + // Inverse throughput. + ResultEntry.RThroughput = + MCSchedModel::getReciprocalThroughput(SI, *RSC.SCDesc); + + // Used processor resources and pressures. 
+ auto PressureIt = RSC.IdealizedProcResPressure.begin(); + auto EndPressureIt = RSC.IdealizedProcResPressure.end(); + for (const auto &WPR : RSC.NonRedundantWriteProcRes) { + ResultEntry.WriteProcResEntries.emplace_back(); + auto &ResWPR = ResultEntry.WriteProcResEntries.back(); + ResWPR.ProcResName = SM.getProcResource(WPR.ProcResourceIdx)->Name; + ResWPR.AcquireAtCycle = WPR.AcquireAtCycle; + ResWPR.ReleaseAtCycle = WPR.ReleaseAtCycle; + if (PressureIt != EndPressureIt && + WPR.ProcResourceIdx == PressureIt->first) { + ResWPR.ResourcePressure = PressureIt->second; + ++PressureIt; + } else { + ResWPR.ResourcePressure = std::nullopt; + } + } } - printClusterRawHtml(BenchmarkClustering::ClusterId::noise(), - "[noise]", OS); + return Result; +} + +template <> +Error Analysis::run( + raw_ostream &OS, Analysis::OutputFormat Format) const { + if (Clustering_.getPoints().empty()) + return Error::success(); + + auto Result = exportResult(); + if (!Result) + return Result.takeError(); + + switch (Format) { + case OF_Default: + AnalysisResult::printHTML(OS, *Result); + break; + case OF_YAML: + AnalysisResult::printYAML(OS, *Result); + break; + default: + llvm_unreachable("Unsupported output format"); + } - OS << ""; return Error::success(); } diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.h b/llvm/tools/llvm-exegesis/lib/Analysis.h index 16eccf6879c23..98c4126d72f2b 100644 --- a/llvm/tools/llvm-exegesis/lib/Analysis.h +++ b/llvm/tools/llvm-exegesis/lib/Analysis.h @@ -22,11 +22,86 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" +#include #include namespace llvm { namespace exegesis { +// Abstractions over analysis results which make it easier +// to print them in different formats. 
+namespace AnalysisResult { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +using SchedClassName = StringRef; +#else +using SchedClassName = unsigned; +#endif + +struct Cluster { + BenchmarkClustering::ClusterId Id; + std::string Snippet; + StringRef Config; + SchedClassName SchedClass; + SmallVector Measurements; +}; +struct Clusters { + SmallVector MeasurementNames; + std::vector Data; +}; + +struct SchedClassInconsistency { + // === SchedClass properties === + SchedClassName Name; + bool IsVariant; + unsigned NumMicroOps; + + // {WriteResourceID, Latency} + SmallVector, 2> Latency; + + double RThroughput; + + struct WriteProcResEntry { + StringRef ProcResName; + uint16_t AcquireAtCycle; + uint16_t ReleaseAtCycle; + std::optional ResourcePressure; + }; + SmallVector WriteProcResEntries; + + // === Collected data === + struct Point { + StringRef Opcode; + StringRef Config; + std::string Snippet; + }; + // [min, mean, max] + using DataPoint = std::array; + + struct Measurement { + BenchmarkClustering::ClusterId ClusterId; + SmallVector Points; + SmallVector Data; + bool IsInconsistent; + }; + SmallVector MeasurementNames; + SmallVector Measurements; +}; +struct SchedClassInconsistencies { + StringRef Triple; + StringRef CPUName; + double Epsilon; + + std::vector Inconsistencies; +}; + +/// Printers +void printCSV(raw_ostream &OS, const Clusters &Data); +void printYAML(raw_ostream &OS, const Clusters &Data); + +void printHTML(raw_ostream &OS, const SchedClassInconsistencies &Data); +void printYAML(raw_ostream &OS, const SchedClassInconsistencies &Data); +} // namespace AnalysisResult + // A helper class to analyze benchmark results for a target. class Analysis { public: @@ -36,15 +111,24 @@ class Analysis { bool AnalysisDisplayUnstableOpcodes); // Prints a csv of instructions for each cluster. 
- struct PrintClusters {}; + struct PrintClusters { + using Result = AnalysisResult::Clusters; + }; // Find potential errors in the scheduling information given measurements. - struct PrintSchedClassInconsistencies {}; + struct PrintSchedClassInconsistencies { + using Result = AnalysisResult::SchedClassInconsistencies; + }; - template Error run(raw_ostream &OS) const; + enum OutputFormat { OF_Default, OF_YAML, OF_JSON }; + template + Error run(raw_ostream &OS, OutputFormat Format) const; private: using ClusterId = BenchmarkClustering::ClusterId; + template + Expected exportResult() const; + // Represents the intersection of a sched class and a cluster. class SchedClassCluster { public: @@ -73,20 +157,6 @@ class Analysis { SchedClassClusterCentroid Centroid; }; - void printInstructionRowCsv(size_t PointId, raw_ostream &OS) const; - - void printClusterRawHtml(const BenchmarkClustering::ClusterId &Id, - StringRef display_name, raw_ostream &OS) const; - - void printPointHtml(const Benchmark &Point, raw_ostream &OS) const; - - void - printSchedClassClustersHtml(const std::vector &Clusters, - const ResolvedSchedClass &SC, - raw_ostream &OS) const; - void printSchedClassDescHtml(const ResolvedSchedClass &SC, - raw_ostream &OS) const; - // A pair of (Sched Class, indices of points that belong to the sched // class). struct ResolvedSchedClassAndPoints { @@ -99,9 +169,9 @@ class Analysis { // Builds a list of ResolvedSchedClassAndPoints. std::vector makePointsPerSchedClass() const; - template - void writeSnippet(raw_ostream &OS, ArrayRef Bytes, - const char *Separator) const; + // Print non-escaped snippet. 
+ void printSnippet(raw_ostream &OS, ArrayRef Bytes, + const char *Separator = "\n") const; const BenchmarkClustering &Clustering_; const LLVMState &State_; diff --git a/llvm/tools/llvm-exegesis/lib/AnalysisPrinters.cpp b/llvm/tools/llvm-exegesis/lib/AnalysisPrinters.cpp new file mode 100644 index 0000000000000..83cb5ec9b5550 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/AnalysisPrinters.cpp @@ -0,0 +1,514 @@ +//===-- AnalysisPrinters.cpp ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Analysis.h" +#include "BenchmarkResult.h" +#include "Clustering.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/YAMLTraits.h" +#include + +using namespace llvm; +using namespace llvm::exegesis; + +static const char kCsvSep = ','; + +namespace { +enum EscapeTag { kNone, kEscapeCsv, kEscapeHtml }; + +template void writeEscaped(raw_ostream &OS, const StringRef S) { + OS << S; +} + +template <> void writeEscaped(raw_ostream &OS, const StringRef S) { + if (!S.contains(kCsvSep)) { + OS << S; + } else { + // Needs escaping. 
+ OS << '"'; + for (const char C : S) { + if (C == '"') + OS << "\"\""; + else + OS << C; + } + OS << '"'; + } +} + +template <> void writeEscaped(raw_ostream &OS, const StringRef S) { + for (const char C : S) { + if (C == '<') + OS << "<"; + else if (C == '>') + OS << ">"; + else if (C == '&') + OS << "&"; + else + OS << C; + } +} + +template +void writeClusterId(raw_ostream &OS, + const BenchmarkClustering::ClusterId &CID) { + if (CID.isNoise()) + writeEscaped(OS, "[noise]"); + else if (CID.isError()) + writeEscaped(OS, "[error]"); + else + OS << CID.getId(); +} + +template +void writeMeasurementValue(raw_ostream &OS, const double Value) { + // Given Value, if we wanted to serialize it to a string, + // how many base-10 digits will we need to store, max? + static constexpr auto MaxDigitCount = + std::numeric_limits::max_digits10; + // Also, we will need a decimal separator. + static constexpr auto DecimalSeparatorLen = 1; // '.' e.g. + // So how long of a string will the serialization produce, max? + static constexpr auto SerializationLen = MaxDigitCount + DecimalSeparatorLen; + + // WARNING: when changing the format, also adjust the small-size estimate ^. + static constexpr StringLiteral SimpleFloatFormat = StringLiteral("{0:F}"); + + writeEscaped( + OS, formatv(SimpleFloatFormat.data(), Value).sstr()); +} +} // anonymous namespace + +void llvm::exegesis::AnalysisResult::printCSV( + raw_ostream &OS, const AnalysisResult::Clusters &Result) { + // Write the header. + OS << "cluster_id" << kCsvSep << "opcode_name" << kCsvSep << "config" + << kCsvSep << "sched_class"; + for (StringRef Name : Result.MeasurementNames) { + OS << kCsvSep; + writeEscaped(OS, Name); + } + OS << "\n"; + + // Prints a row representing an instruction, along with scheduling info and + // point coordinates (measurements). 
+ for (const auto &Row : Result.Data) { + writeClusterId(OS, Row.Id); + OS << kCsvSep; + writeEscaped(OS, Row.Snippet); + OS << kCsvSep; + writeEscaped(OS, Row.Config); + OS << kCsvSep; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + writeEscaped(OS, Row.SchedClass); +#else + OS << Row.SchedClass; +#endif + for (double Measurement : Row.Measurements) { + OS << kCsvSep; + writeMeasurementValue(OS, Measurement); + } + OS << "\n"; + } +} + +namespace llvm { +namespace yaml { +template <> struct ScalarTraits { + static void output(const BenchmarkClustering::ClusterId &Value, void *, + raw_ostream &OS) { + if (Value.isUnstable()) { + OS << "unstable<"; + writeClusterId(OS, Value); + OS << ">"; + } else { + writeClusterId(OS, Value); + } + } + + static StringRef input(StringRef Text, void *, + BenchmarkClustering::ClusterId &Value) { + size_t Id; + + if (Text == "[noise]") { + Value = BenchmarkClustering::ClusterId::noise(); + } else if (Text == "[error]") { + Value = BenchmarkClustering::ClusterId::error(); + } else if (Text.consume_front("unstable<")) { + if (!Text.consumeInteger(10, Id) && Text == ">") + Value = BenchmarkClustering::ClusterId::makeValidUnstable(Id); + else + return "Expect 'unstable'"; + } else if (!Text.getAsInteger(10, Id)) { + Value = BenchmarkClustering::ClusterId::makeValid(Id); + } else { + return "Unrecognized ClusterId value"; + } + + return StringRef(); + } + + static QuotingType mustQuote(StringRef) { return QuotingType::Single; } + + static const bool flow = true; +}; + +template <> struct SequenceElementTraits { + static const bool flow = false; +}; + +template <> struct MappingTraits { + static void mapping(IO &Io, AnalysisResult::Cluster &Obj) { + Io.mapRequired("id", Obj.Id); + Io.mapRequired("snippet", Obj.Snippet); + Io.mapRequired("config", Obj.Config); + Io.mapRequired("sched_class", Obj.SchedClass); + Io.mapRequired("measurements", Obj.Measurements); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &Io, 
AnalysisResult::Clusters &Obj) { + Io.mapRequired("measurement_names", Obj.MeasurementNames); + Io.mapRequired("data", Obj.Data); + } +}; +} // namespace yaml +} // namespace llvm + +void llvm::exegesis::AnalysisResult::printYAML( + raw_ostream &OS, const AnalysisResult::Clusters &Result) { + yaml::Output YOS(OS, /*Ctx=*/nullptr, /*WrapColumn=*/200); + YOS << const_cast(Result); +} + +static constexpr const char kHtmlHead[] = R"( + +llvm-exegesis Analysis Results + + +)"; + +namespace { +using namespace AnalysisResult; +void printSchedClassClustersHTML( + raw_ostream &OS, + ArrayRef Measurements, + ArrayRef MeasurementNames) { + OS << ""; + OS << ""; + for (StringRef Name : MeasurementNames) { + OS << ""; + } + OS << ""; + for (const auto &M : Measurements) { + OS << ""; + + for (const auto &Stats : M.Data) { + OS << ""; + } + OS << ""; + } + OS << "
    ClusterIdOpcode/Config"; + writeEscaped(OS, Name); + OS << "
    "; + writeClusterId(OS, M.ClusterId); + OS << "
      "; + for (const auto &P : M.Points) { + // Show up when the cursor is hovered over. + OS << "
    • (OS, P.Snippet); + OS << "\">"; + + writeEscaped(OS, P.Opcode); + OS << " "; + writeEscaped(OS, P.Config); + OS << "
    • "; + } + OS << "
    "; + writeMeasurementValue(OS, Stats[1]); + OS << "
    ["; + writeMeasurementValue(OS, Stats[0]); + OS << ";"; + writeMeasurementValue(OS, Stats[2]); + OS << "]
    "; +} + +void printSchedClassDescHTML(raw_ostream &OS, + const SchedClassInconsistency &SCI) { + OS << ""; + OS << ""; + + OS << ""; + OS << ""; + OS << ""; + // Latencies. + OS << ""; + // Inverse throughput. + OS << ""; + // WriteProcRes. + OS << ""; + // Idealized port pressure. + OS << ""; + OS << ""; + OS << "
    ValidVariantNumMicroOpsNormalized " + "LatencyRThroughputWriteProcResIdealized Resource Pressure
    " << (SCI.IsVariant ? "✔" : "✕") << "" << SCI.NumMicroOps << "
      "; + for (const auto &L : SCI.Latency) { + OS << "
    • " << L.second; + if (SCI.Latency.size() > 1) { + // Disambiguate if more than 1 latency. + OS << " (WriteResourceID " << L.first << ")"; + } + OS << "
    • "; + } + OS << "
    "; + writeMeasurementValue(OS, SCI.RThroughput); + OS << "
      "; + for (const auto &WPR : SCI.WriteProcResEntries) { + OS << "
    • "; + writeEscaped(OS, WPR.ProcResName); + OS << ": " + << formatv("[{0}, {1}]", WPR.AcquireAtCycle, WPR.ReleaseAtCycle) + << "
    • "; + } + OS << "
      "; + for (const auto &WPR : SCI.WriteProcResEntries) { + if (!WPR.ResourcePressure.has_value()) + continue; + OS << "
    • "; + writeEscaped(OS, WPR.ProcResName); + OS << ": "; + writeMeasurementValue(OS, *WPR.ResourcePressure); + OS << "
    • "; + } + OS << "
    "; +} +} // anonymous namespace + +void llvm::exegesis::AnalysisResult::printHTML( + raw_ostream &OS, const AnalysisResult::SchedClassInconsistencies &Result) { + // Print the header. + OS << "" << kHtmlHead << ""; + OS << "

    llvm-exegesis Analysis Results

    "; + OS << "

    Triple: "; + writeEscaped(OS, Result.Triple); + OS << "

    Cpu: "; + writeEscaped(OS, Result.CPUName); + OS << "

    "; + OS << "

    Epsilon: " << format("%0.2f", Result.Epsilon) + << "

    "; + + for (const auto &SCI : Result.Inconsistencies) { + OS << "

    Sched Class "; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + writeEscaped(OS, SCI.Name); +#else + OS << SCI.Name; +#endif + OS << " contains instructions whose performance characteristics do" + " not match that of LLVM:

    "; + printSchedClassClustersHTML(OS, SCI.Measurements, SCI.MeasurementNames); + OS << "

    llvm SchedModel data:

    "; + printSchedClassDescHTML(OS, SCI); + OS << "
    "; + } + + // TODO: Print noise data points. + OS << ""; +} + +namespace llvm { +namespace yaml { + +template <> +struct SequenceElementTraits { + static const bool flow = false; +}; + +template <> +struct SequenceElementTraits< + AnalysisResult::SchedClassInconsistency::WriteProcResEntry> { + static const bool flow = false; +}; + +template <> +struct MappingTraits< + AnalysisResult::SchedClassInconsistency::WriteProcResEntry> { + static void + mapping(IO &Io, + AnalysisResult::SchedClassInconsistency::WriteProcResEntry &Obj) { + Io.mapRequired("name", Obj.ProcResName); + Io.mapRequired("acquire_cycle", Obj.AcquireAtCycle); + Io.mapRequired("release_cycle", Obj.ReleaseAtCycle); + Io.mapOptional("pressure", Obj.ResourcePressure); + } + + static const bool flow = true; +}; + +template <> +struct SequenceElementTraits { + static const bool flow = false; +}; + +template <> +struct MappingTraits { + static void mapping(IO &Io, + AnalysisResult::SchedClassInconsistency::Point &Obj) { + Io.mapRequired("opcode", Obj.Opcode); + Io.mapRequired("config", Obj.Config); + Io.mapRequired("snippet", Obj.Snippet); + } +}; + +template <> +struct SequenceElementTraits< + AnalysisResult::SchedClassInconsistency::DataPoint> { + static const bool flow = true; +}; + +template <> +struct SequenceTraits { + using DataPoint = AnalysisResult::SchedClassInconsistency::DataPoint; + static size_t size(IO &, DataPoint &Obj) { return Obj.size(); } + + static DataPoint::value_type &element(IO &, DataPoint &Obj, size_t Index) { + return Obj[Index]; + } + + static const bool flow = true; +}; + +template <> +struct SequenceElementTraits< + AnalysisResult::SchedClassInconsistency::Measurement> { + static const bool flow = false; +}; + +template <> +struct MappingTraits { + static void + mapping(IO &Io, AnalysisResult::SchedClassInconsistency::Measurement &Obj) { + Io.mapRequired("cluster_id", Obj.ClusterId); + Io.mapRequired("points", Obj.Points); + Io.mapRequired("data", Obj.Data); + 
Io.mapRequired("inconsistent", Obj.IsInconsistent); + } +}; + +template <> struct SequenceTraits> { + using Pair = std::pair; + static size_t size(IO &, Pair &) { return 2; } + + static unsigned &element(IO &, Pair &Obj, size_t Index) { + return Index == 0 ? Obj.first : Obj.second; + } + + static const bool flow = true; +}; + +template <> struct SequenceElementTraits> { + static const bool flow = true; +}; + +template <> struct MappingTraits { + static void mapping(IO &Io, AnalysisResult::SchedClassInconsistency &Obj) { + Io.mapRequired("name", Obj.Name); + Io.mapRequired("variant", Obj.IsVariant); + Io.mapRequired("num_microops", Obj.NumMicroOps); + Io.mapRequired("latency", Obj.Latency); + Io.mapRequired("rthroughput", Obj.RThroughput); + + Io.mapRequired("write_proc_res", Obj.WriteProcResEntries); + + Io.mapRequired("measurement_names", Obj.MeasurementNames); + Io.mapRequired("measurements", Obj.Measurements); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &Io, AnalysisResult::SchedClassInconsistencies &Obj) { + Io.mapRequired("triple", Obj.Triple); + Io.mapRequired("cpu", Obj.CPUName); + Io.mapOptional("epsilon", Obj.Epsilon); + Io.mapRequired("inconsistencies", Obj.Inconsistencies); + } +}; +} // namespace yaml +} // namespace llvm + +void llvm::exegesis::AnalysisResult::printYAML( + raw_ostream &OS, const AnalysisResult::SchedClassInconsistencies &Result) { + yaml::Output YOS(OS, /*Ctx=*/nullptr, /*WrapColumn=*/200); + YOS << const_cast(Result); +} diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp index 84dc23b343c6c..4cbc697a37575 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp @@ -9,16 +9,20 @@ #include "BenchmarkResult.h" #include "BenchmarkRunner.h" #include "Error.h" +#include "Timer.h" #include "ValidationEvent.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include 
"llvm/ADT/StringRef.h" #include "llvm/ADT/bit.h" #include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/Base64.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" static constexpr const char kIntegerPrefix[] = "i_0x"; @@ -27,6 +31,12 @@ static constexpr const char kInvalidOperand[] = "INVALID"; namespace llvm { +static cl::opt ForceObjectFileCompressionFormat( + "exegesis-force-obj-compress-format", cl::Hidden, + cl::desc("Force to use this compression format for object files."), + cl::values(clEnumValN(compression::Format::Zstd, "zstd", "Using Zstandard"), + clEnumValN(compression::Format::Zlib, "zlib", "Using LibZ"))); + namespace { // A mutable struct holding an LLVMState that can be passed through the @@ -89,7 +99,7 @@ struct YamlContext { OS.write_hex(bit_cast(Value)); } - bool tryDeserializeIntegerOperand(StringRef String, int64_t &Value) { + bool tryDeserializeIntegerOperand(StringRef String, uint64_t &Value) { if (!String.consume_front(kIntegerPrefix)) return false; return !String.consumeInteger(16, Value); @@ -121,10 +131,10 @@ struct YamlContext { MCOperand deserializeMCOperand(StringRef String) { assert(!String.empty()); - int64_t IntValue = 0; + uint64_t IntValue = 0; double DoubleValue = 0; if (tryDeserializeIntegerOperand(String, IntValue)) - return MCOperand::createImm(IntValue); + return MCOperand::createImm(bit_cast(IntValue)); if (tryDeserializeFPOperand(String, DoubleValue)) return MCOperand::createDFPImm(bit_cast(DoubleValue)); if (auto RegNo = getRegNo(String)) @@ -278,6 +288,13 @@ template <> struct ScalarTraits { static const bool flow = true; }; +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &Io, compression::Format &Format) { + Io.enumCase(Format, "zstd", compression::Format::Zstd); + Io.enumCase(Format, 
"zlib", compression::Format::Zlib); + } +}; + template <> struct MappingContextTraits { static void mapping(IO &Io, exegesis::BenchmarkKey &Obj, YamlContext &Context) { @@ -288,6 +305,33 @@ template <> struct MappingContextTraits { } }; +template <> struct MappingTraits { + struct NormalizedBase64Binary { + std::string Base64Str; + + NormalizedBase64Binary(IO &) {} + NormalizedBase64Binary(IO &, const std::vector &Data) + : Base64Str(llvm::encodeBase64(Data)) {} + + std::vector denormalize(IO &) { + std::vector Buffer; + if (Error E = llvm::decodeBase64(Base64Str, Buffer)) + report_fatal_error(std::move(E)); + + StringRef Data(Buffer.data(), Buffer.size()); + return std::vector(Data.bytes_begin(), Data.bytes_end()); + } + }; + + static void mapping(IO &Io, exegesis::Benchmark::ObjectFile &Obj) { + Io.mapRequired("compression", Obj.CompressionFormat); + Io.mapRequired("original_size", Obj.UncompressedSize); + MappingNormalization> + ObjFileString(Io, Obj.CompressedBytes); + Io.mapRequired("compressed_bytes", ObjFileString->Base64Str); + } +}; + template <> struct MappingContextTraits { struct NormalizedBinary { NormalizedBinary(IO &io) {} @@ -325,9 +369,11 @@ template <> struct MappingContextTraits { Io.mapRequired("error", Obj.Error); Io.mapOptional("info", Obj.Info); // AssembledSnippet - MappingNormalization> BinaryString( + MappingNormalization> SnippetString( Io, Obj.AssembledSnippet); - Io.mapOptional("assembled_snippet", BinaryString->Binary); + Io.mapOptional("assembled_snippet", SnippetString->Binary); + // ObjectFile + Io.mapOptional("object_file", Obj.ObjFile); } }; @@ -364,6 +410,52 @@ Benchmark::readTriplesAndCpusFromYamls(MemoryBufferRef Buffer) { return Result; } +Error Benchmark::setObjectFile(StringRef RawBytes) { + SmallVector CompressedBytes; + llvm::compression::Format CompressionFormat; + + auto isFormatAvailable = [](llvm::compression::Format F) -> bool { + switch (F) { + case compression::Format::Zstd: + return 
compression::zstd::isAvailable(); + case compression::Format::Zlib: + return compression::zlib::isAvailable(); + } + }; + if (ForceObjectFileCompressionFormat.getNumOccurrences() > 0) { + CompressionFormat = ForceObjectFileCompressionFormat; + if (!isFormatAvailable(CompressionFormat)) + return make_error( + "The designated compression format is not available.", + inconvertibleErrorCode()); + } else if (isFormatAvailable(compression::Format::Zstd)) { + // Try newer compression algorithm first. + CompressionFormat = compression::Format::Zstd; + } else if (isFormatAvailable(compression::Format::Zlib)) { + CompressionFormat = compression::Format::Zlib; + } else { + return make_error( + "None of the compression methods is available.", + inconvertibleErrorCode()); + } + + switch (CompressionFormat) { + case compression::Format::Zstd: + compression::zstd::compress({RawBytes.bytes_begin(), RawBytes.bytes_end()}, + CompressedBytes); + break; + case compression::Format::Zlib: + compression::zlib::compress({RawBytes.bytes_begin(), RawBytes.bytes_end()}, + CompressedBytes); + break; + } + + ObjFile = {CompressionFormat, + RawBytes.size(), + {CompressedBytes.begin(), CompressedBytes.end()}}; + return Error::success(); +} + Expected Benchmark::readYaml(const LLVMState &State, MemoryBufferRef Buffer) { yaml::Input Yin(Buffer); @@ -378,6 +470,8 @@ Expected Benchmark::readYaml(const LLVMState &State, Expected> Benchmark::readYamls(const LLVMState &State, MemoryBufferRef Buffer) { + NamedRegionTimer T("readYamls", "Read YAML Benchmarks", TimerGroupName, + TimerGroupDescription, TimerIsEnabled); yaml::Input Yin(Buffer); YamlContext Context(State); std::vector Benchmarks; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h index 3c09a8380146e..a5217566204a1 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include 
    "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -76,6 +77,11 @@ struct BenchmarkKey { uintptr_t SnippetAddress = 0; // The register that should be used to hold the loop counter. unsigned LoopRegister; + + bool operator==(const BenchmarkKey &RHS) const { + return Config == RHS.Config && + Instructions[0].getOpcode() == RHS.Instructions[0].getOpcode(); + } }; struct BenchmarkMeasure { @@ -122,6 +128,16 @@ struct Benchmark { std::string Error; std::string Info; std::vector AssembledSnippet; + + struct ObjectFile { + llvm::compression::Format CompressionFormat; + size_t UncompressedSize = 0; + std::vector CompressedBytes; + + bool isValid() const { return UncompressedSize && CompressedBytes.size(); } + }; + std::optional ObjFile; + // How to aggregate measurements. enum ResultAggregationModeE { Min, Max, Mean, MinVariance }; @@ -132,6 +148,10 @@ struct Benchmark { Benchmark &operator=(const Benchmark &) = delete; Benchmark &operator=(Benchmark &&) = delete; + // Compress raw object file bytes and store the compressed bytes, the + // compression format, and the uncompressed size in ObjFile. + class Error setObjectFile(StringRef RawBytes); + // Read functions. 
static Expected readYaml(const LLVMState &State, MemoryBufferRef Buffer); diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 9116b5ced0274..130482cc4f412 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -14,6 +14,7 @@ #include "PerfHelper.h" #include "SubprocessMemory.h" #include "Target.h" +#include "Timer.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -26,6 +27,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" #include "llvm/Support/SystemZ/zOSSupport.h" +#include "llvm/Support/Timer.h" #include #include #include @@ -53,6 +55,12 @@ namespace llvm { namespace exegesis { +static cl::opt + DryRunMeasurement("dry-run-measurement", + cl::desc("Run every steps in the measurement phase " + "except executing the snippet."), + cl::init(false), cl::Hidden); + BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, ExecutionModeE ExecutionMode, @@ -139,14 +147,17 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { pfm::CounterGroup *Counter = CounterOrError.get().get(); Scratch->clear(); { + bool DryRun = DryRunMeasurement; auto PS = ET.withSavedState(); CrashRecoveryContext CRC; CrashRecoveryContext::Enable(); - const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() { - Counter->start(); - this->Function(ScratchPtr); - Counter->stop(); - }); + const bool Crashed = + !CRC.RunSafely([this, Counter, ScratchPtr, DryRun]() { + Counter->start(); + if (!DryRun) + this->Function(ScratchPtr); + Counter->stop(); + }); CrashRecoveryContext::Disable(); PS.reset(); if (Crashed) { @@ -631,6 +642,9 @@ BenchmarkRunner::getRunnableConfiguration( // the snippet for debug/analysis. 
This is so that the user clearly // understands that the inside instructions are repeated. if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) { + NamedRegionTimer T("prepare-and-assemble-snippet", + "Prepare And Assemble Snippet", TimerGroupName, + TimerGroupDescription, TimerIsEnabled); const int MinInstructionsForSnippet = 4 * Instructions.size(); const int LoopBodySizeForSnippet = 2 * Instructions.size(); auto Snippet = @@ -648,17 +662,55 @@ BenchmarkRunner::getRunnableConfiguration( // MinInstructions instructions. if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) { + NamedRegionTimer T("assemble-measured-code", "Assemble Measured Code", + TimerGroupName, TimerGroupDescription, TimerIsEnabled); auto Snippet = assembleSnippet(BC, Repetitor, BenchmarkResult.MinInstructions, LoopBodySize, GenerateMemoryInstructions); if (Error E = Snippet.takeError()) return std::move(E); + if (Error E = BenchmarkResult.setObjectFile(*Snippet)) + return std::move(E); RC.ObjectFile = getObjectFromBuffer(*Snippet); } return std::move(RC); } +Expected +BenchmarkRunner::getRunnableConfiguration(Benchmark &&B) const { + NamedRegionTimer T("decompression", "Decompress serialized object file", + TimerGroupName, TimerGroupDescription, TimerIsEnabled); + assert(B.ObjFile.has_value() && B.ObjFile->isValid() && + "No serialized obejct file is attached?"); + const Benchmark::ObjectFile &ObjFile = *B.ObjFile; + SmallVector DecompressedObjFile; + switch (ObjFile.CompressionFormat) { + case compression::Format::Zstd: + if (!compression::zstd::isAvailable()) + return make_error("zstd is not available for decompression.", + inconvertibleErrorCode()); + if (Error E = compression::zstd::decompress(ObjFile.CompressedBytes, + DecompressedObjFile, + ObjFile.UncompressedSize)) + return std::move(E); + break; + case compression::Format::Zlib: + if (!compression::zlib::isAvailable()) + return make_error("zlib is not available for decompression.", + 
inconvertibleErrorCode()); + if (Error E = compression::zlib::decompress(ObjFile.CompressedBytes, + DecompressedObjFile, + ObjFile.UncompressedSize)) + return std::move(E); + break; + } + + StringRef Buffer(reinterpret_cast(DecompressedObjFile.begin()), + DecompressedObjFile.size()); + return RunnableConfiguration{std::move(B), getObjectFromBuffer(Buffer)}; +} + Expected> BenchmarkRunner::createFunctionExecutor( object::OwningBinary ObjectFile, @@ -696,6 +748,8 @@ BenchmarkRunner::createFunctionExecutor( std::pair BenchmarkRunner::runConfiguration( RunnableConfiguration &&RC, const std::optional &DumpFile, std::optional BenchmarkProcessCPU) const { + NamedRegionTimer T("measurement", "Measure Performance", TimerGroupName, + TimerGroupDescription, TimerIsEnabled); Benchmark &BenchmarkResult = RC.BenchmarkResult; object::OwningBinary &ObjectFile = RC.ObjectFile; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h index e688b814d1c83..34e36ca0f9759 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -54,11 +54,15 @@ class BenchmarkRunner { RunnableConfiguration &operator=(RunnableConfiguration &&) = delete; RunnableConfiguration &operator=(const RunnableConfiguration &) = delete; + Benchmark BenchmarkResult; + object::OwningBinary ObjectFile; + private: RunnableConfiguration() = default; - Benchmark BenchmarkResult; - object::OwningBinary ObjectFile; + RunnableConfiguration(Benchmark &&B, + object::OwningBinary &&OF) + : BenchmarkResult(std::move(B)), ObjectFile(std::move(OF)) {} }; Expected @@ -66,6 +70,8 @@ class BenchmarkRunner { unsigned MinInstructions, unsigned LoopUnrollFactor, const SnippetRepetitor &Repetitor) const; + Expected getRunnableConfiguration(Benchmark &&B) const; + std::pair runConfiguration(RunnableConfiguration &&RC, const std::optional &DumpFile, diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt 
b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt index 414b49e5e021c..9be381cf42562 100644 --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -12,6 +12,9 @@ endif() if (LLVM_TARGETS_TO_BUILD MATCHES "Mips") list(APPEND LLVM_EXEGESIS_TARGETS "Mips") endif() +if (LLVM_TARGETS_TO_BUILD MATCHES "RISCV") + list(APPEND LLVM_EXEGESIS_TARGETS "RISCV") +endif() set(LLVM_EXEGESIS_TARGETS ${LLVM_EXEGESIS_TARGETS} PARENT_SCOPE) @@ -50,6 +53,7 @@ add_llvm_library(LLVMExegesis DISABLE_LLVM_LINK_LLVM_DYLIB STATIC Analysis.cpp + AnalysisPrinters.cpp Assembler.cpp BenchmarkResult.cpp BenchmarkRunner.cpp @@ -72,6 +76,7 @@ add_llvm_library(LLVMExegesis SnippetRepetitor.cpp SubprocessMemory.cpp Target.cpp + Timer.cpp UopsBenchmarkRunner.cpp ValidationEvent.cpp diff --git a/llvm/tools/llvm-exegesis/lib/Clustering.cpp b/llvm/tools/llvm-exegesis/lib/Clustering.cpp index fc79718fdeb22..2df22571138c5 100644 --- a/llvm/tools/llvm-exegesis/lib/Clustering.cpp +++ b/llvm/tools/llvm-exegesis/lib/Clustering.cpp @@ -8,6 +8,7 @@ #include "Clustering.h" #include "Error.h" +#include "ProgressMeter.h" #include "SchedClassResolution.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" @@ -129,8 +130,12 @@ Error BenchmarkClustering::validateAndSetup() { } void BenchmarkClustering::clusterizeDbScan(const size_t MinPts) { + ProgressMeter<> Meter(Points_.size()); + std::vector Neighbors; // Persistent buffer to avoid allocs. for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { + ProgressMeter<>::ProgressMeterStep MeterStep(&Meter); + if (!ClusterIdForPoint_[P].isUndef()) continue; // Previously processed in inner loop. 
rangeQuery(P, Neighbors); diff --git a/llvm/tools/llvm-exegesis/lib/Clustering.h b/llvm/tools/llvm-exegesis/lib/Clustering.h index 9d6c110e2e854..c1d68110c8e1a 100644 --- a/llvm/tools/llvm-exegesis/lib/Clustering.h +++ b/llvm/tools/llvm-exegesis/lib/Clustering.h @@ -47,6 +47,11 @@ class BenchmarkClustering { ClusterId() : Id_(kUndef), IsUnstable_(false) {} + ClusterId(const ClusterId &) = default; + ClusterId(ClusterId &&) = default; + ClusterId &operator=(const ClusterId &) = default; + ClusterId &operator=(ClusterId &&) = default; + // Compare id's, ignoring the 'unstability' bit. bool operator==(const ClusterId &O) const { return Id_ == O.Id_; } bool operator<(const ClusterId &O) const { return Id_ < O.Id_; } diff --git a/llvm/tools/llvm-exegesis/lib/LlvmState.cpp b/llvm/tools/llvm-exegesis/lib/LlvmState.cpp index d453d460abafc..b04a6e823b92c 100644 --- a/llvm/tools/llvm-exegesis/lib/LlvmState.cpp +++ b/llvm/tools/llvm-exegesis/lib/LlvmState.cpp @@ -46,7 +46,7 @@ Expected LLVMState::Create(std::string TripleName, CpuName = std::string(sys::getHostCPUName()); std::unique_ptr STI( - TheTarget->createMCSubtargetInfo(TripleName, CpuName, "")); + TheTarget->createMCSubtargetInfo(TripleName, CpuName, Features)); assert(STI && "Unable to create subtarget info!"); if (!STI->isCPUStringValid(CpuName)) { return make_error(Twine("invalid CPU name (") diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp index 9c926d1fc6112..ae7e0fb296b99 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -44,6 +44,8 @@ bool Operand::isDef() const { return IsDef; } bool Operand::isUse() const { return !IsDef; } +bool Operand::isEarlyClobber() const { return IsEarlyClobber; } + bool Operand::isReg() const { return Tracker; } bool Operand::isTied() const { return TiedToIndex.has_value(); } @@ -114,6 +116,8 @@ Instruction::create(const MCInstrInfo &InstrInfo, Operand 
Operand; Operand.Index = OpIndex; Operand.IsDef = (OpIndex < Description->getNumDefs()); + Operand.IsEarlyClobber = + (Description->getOperandConstraint(OpIndex, MCOI::EARLY_CLOBBER) != -1); // TODO(gchatelet): Handle isLookupPtrRegClass. if (OpInfo.RegClass >= 0) Operand.Tracker = &RATC.getRegisterClass(OpInfo.RegClass); diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h index f8ebc07d01f35..efc900161786c 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -67,6 +67,7 @@ struct Operand { bool isImplicitReg() const; bool isDef() const; bool isUse() const; + bool isEarlyClobber() const; bool isReg() const; bool isTied() const; bool isVariable() const; @@ -82,6 +83,7 @@ struct Operand { // Please use the accessors above and not the following fields. std::optional Index; bool IsDef = false; + bool IsEarlyClobber = false; const RegisterAliasingTracker *Tracker = nullptr; // Set for Register Op. const MCOperandInfo *Info = nullptr; // Set for Explicit Op. std::optional TiedToIndex; // Set for Reg&Explicit Op. @@ -115,6 +117,8 @@ struct Instruction { Instruction &operator=(const Instruction &) = delete; Instruction &operator=(Instruction &&) = delete; + unsigned getOpcode() const { return Description.getOpcode(); } + // Returns the Operand linked to this Variable. // In case the Variable is tied, the primary (i.e. Def) Operand is returned. 
const Operand &getPrimaryOperand(const Variable &Var) const; diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp index 3f3288ceb1e4f..08562f1254f66 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -17,6 +17,11 @@ #include #endif +#include +#include +#include +#include + #include #include #include // for erno @@ -44,6 +49,12 @@ void pfmTerminate() { #endif } +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) { + int ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); + return ret; +} + // Performance counters may be unavailable for a number of reasons (such as // kernel.perf_event_paranoid restriction or CPU being unknown to libpfm). // @@ -51,12 +62,7 @@ void pfmTerminate() { // counters while still passing control to the generated code snippet. const char *const PerfEvent::DummyEventString = "not-really-an-event"; -PerfEvent::~PerfEvent() { -#ifdef HAVE_LIBPFM - delete Attr; - ; -#endif -} +PerfEvent::~PerfEvent() { delete Attr; } PerfEvent::PerfEvent(PerfEvent &&Other) : EventString(std::move(Other.EventString)), @@ -112,7 +118,6 @@ ConfiguredEvent::ConfiguredEvent(PerfEvent &&EventToConfigure) assert(Event.valid()); } -#ifdef HAVE_LIBPFM void ConfiguredEvent::initRealEvent(const pid_t ProcessID, const int GroupFD) { const int CPU = -1; const uint32_t Flags = 0; @@ -145,17 +150,6 @@ ConfiguredEvent::readOrError(StringRef /*unused*/) const { } ConfiguredEvent::~ConfiguredEvent() { close(FileDescriptor); } -#else -void ConfiguredEvent::initRealEvent(pid_t ProcessID, const int GroupFD) {} - -Expected> -ConfiguredEvent::readOrError(StringRef /*unused*/) const { - return make_error("Not implemented", - errc::function_not_supported); -} - -ConfiguredEvent::~ConfiguredEvent() = default; -#endif // HAVE_LIBPFM CounterGroup::CounterGroup(PerfEvent &&E, std::vector 
&&ValEvents, pid_t ProcessID) @@ -169,7 +163,6 @@ CounterGroup::CounterGroup(PerfEvent &&E, std::vector &&ValEvents, initRealEvent(ProcessID); } -#ifdef HAVE_LIBPFM void CounterGroup::initRealEvent(pid_t ProcessID) { EventCounter.initRealEvent(ProcessID); @@ -178,8 +171,10 @@ void CounterGroup::initRealEvent(pid_t ProcessID) { } void CounterGroup::start() { - if (!IsDummyEvent) + if (!IsDummyEvent) { ioctl(getFileDescriptor(), PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + ioctl(getFileDescriptor(), PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); + } } void CounterGroup::stop() { @@ -215,32 +210,6 @@ CounterGroup::readValidationCountersOrError() const { } int CounterGroup::numValues() const { return 1; } -#else - -void CounterGroup::initRealEvent(pid_t ProcessID) {} - -void CounterGroup::start() {} - -void CounterGroup::stop() {} - -Expected> -CounterGroup::readOrError(StringRef /*unused*/) const { - if (IsDummyEvent) { - SmallVector Result; - Result.push_back(42); - return Result; - } - return make_error("Not implemented", errc::io_error); -} - -Expected> -CounterGroup::readValidationCountersOrError() const { - return SmallVector(0); -} - -int CounterGroup::numValues() const { return 1; } - -#endif } // namespace pfm } // namespace exegesis diff --git a/llvm/tools/llvm-exegesis/lib/ProgressMeter.h b/llvm/tools/llvm-exegesis/lib/ProgressMeter.h index c09b9e9604517..9ea27bf5c47ac 100644 --- a/llvm/tools/llvm-exegesis/lib/ProgressMeter.h +++ b/llvm/tools/llvm-exegesis/lib/ProgressMeter.h @@ -9,6 +9,7 @@ #ifndef LLVM_TOOLS_LLVM_EXEGESIS_PROGRESSMETER_H #define LLVM_TOOLS_LLVM_EXEGESIS_PROGRESSMETER_H +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include @@ -67,6 +68,7 @@ class ProgressMeter { raw_ostream &Out; const int NumStepsTotal; SimpleMovingAverage ElapsedTotal; + ListSeparator Carriage; public: friend class ProgressMeterStep; @@ -93,10 +95,12 @@ class ProgressMeter { }; ProgressMeter(int NumStepsTotal_, 
raw_ostream &out_ = errs()) - : Out(out_), NumStepsTotal(NumStepsTotal_) { + : Out(out_), NumStepsTotal(NumStepsTotal_), Carriage("\r") { assert(NumStepsTotal > 0 && "No steps are planned?"); } + ~ProgressMeter() { Out << "\n"; } + ProgressMeter(const ProgressMeter &) = delete; ProgressMeter(ProgressMeter &&) = delete; ProgressMeter &operator=(const ProgressMeter &) = delete; @@ -114,7 +118,7 @@ class ProgressMeter { if (NewProgress < OldProgress + 1) return; - Out << format("Processing... %*d%%", 3, NewProgress); + Out << Carriage << format("Processing... %*d%%", 3, NewProgress); if (NewEta) { int SecondsTotal = std::ceil(NewEta->count()); int Seconds = SecondsTotal % 60; @@ -122,7 +126,6 @@ class ProgressMeter { Out << format(", ETA %02d:%02d", MinutesTotal, Seconds); } - Out << "\n"; Out.flush(); } diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt new file mode 100644 index 0000000000000..8a2646d302b0b --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt @@ -0,0 +1,25 @@ +include_directories( + ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV + ${LLVM_BINARY_DIR}/lib/Target/RISCV + ) + +set(LLVM_LINK_COMPONENTS + RISCV + CodeGenTypes + Core + Exegesis + MC + Support + ) + +add_llvm_library(LLVMExegesisRISCV + DISABLE_LLVM_LINK_LLVM_DYLIB + STATIC + RISCVExegesisPostprocessing.cpp + RISCVExegesisPreprocessing.cpp + Target.cpp + + DEPENDS + intrinsics_gen + RISCVCommonTableGen + ) diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPasses.h b/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPasses.h new file mode 100644 index 0000000000000..f206966331756 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPasses.h @@ -0,0 +1,19 @@ +//===- RISCVExegesisPasses.h - RISC-V specific Exegesis Passes --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_EXEGESIS_LIB_RISCV_RISCVEXEGESISPASSES_H +#define LLVM_TOOLS_EXEGESIS_LIB_RISCV_RISCVEXEGESISPASSES_H +namespace llvm { +class FunctionPass; + +namespace exegesis { +FunctionPass *createRISCVPreprocessingPass(); +FunctionPass *createRISCVPostprocessingPass(); +} // namespace exegesis +} // namespace llvm +#endif diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPostprocessing.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPostprocessing.cpp new file mode 100644 index 0000000000000..e8220b82f37b7 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPostprocessing.cpp @@ -0,0 +1,126 @@ +//===- RISCVExegesisPostprocessing.cpp - Post processing MI for exegesis---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// Currently there is only one post-processing we need to do for exegesis: +// Assign a physical register to VSETVL's rd if it's not X0 (i.e. VLMAX). 
+// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVExegesisPasses.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-exegesis-post-processing" + +namespace { +struct RISCVExegesisPostprocessing : public MachineFunctionPass { + static char ID; + + RISCVExegesisPostprocessing() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + // Extremely simple register allocator that picks a register that hasn't + // been defined or used in this function. + Register allocateGPRRegister(const MachineFunction &MF, + const MachineRegisterInfo &MRI); + + bool processVSETVL(MachineInstr &MI, MachineRegisterInfo &MRI); + bool processWriteFRM(MachineInstr &MI, MachineRegisterInfo &MRI); +}; +} // anonymous namespace + +char RISCVExegesisPostprocessing::ID = 0; + +bool RISCVExegesisPostprocessing::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + for (auto &MBB : MF) + for (auto &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case RISCV::VSETVLI: + case RISCV::VSETVL: + case RISCV::PseudoVSETVLI: + case RISCV::PseudoVSETVLIX0: + Changed |= processVSETVL(MI, MF.getRegInfo()); + break; + case RISCV::SwapFRMImm: + case RISCV::WriteFRM: + Changed |= processWriteFRM(MI, MF.getRegInfo()); + break; + default: + break; + } + } + + if (Changed) + MF.getRegInfo().clearVirtRegs(); + + return Changed; +} + +Register RISCVExegesisPostprocessing::allocateGPRRegister( + const MachineFunction &MF, const MachineRegisterInfo &MRI) { + const auto &TRI = *MRI.getTargetRegisterInfo(); + + const TargetRegisterClass *GPRClass = + TRI.getRegClass(RISCV::GPRJALRRegClassID); + BitVector Candidates = 
TRI.getAllocatableSet(MF, GPRClass); + + for (unsigned SetIdx : Candidates.set_bits()) { + if (MRI.reg_empty(Register(SetIdx))) + return Register(SetIdx); + } + + // All bets are off, assigned a fixed one. + return RISCV::X5; +} + +bool RISCVExegesisPostprocessing::processVSETVL(MachineInstr &MI, + MachineRegisterInfo &MRI) { + bool Changed = false; + // Replace both AVL and VL (i.e. the result) operands with physical + // registers. + for (unsigned Idx = 0U; Idx < 2; ++Idx) + if (MI.getOperand(Idx).isReg()) { + Register RegOp = MI.getOperand(Idx).getReg(); + if (RegOp.isVirtual()) { + MRI.replaceRegWith(RegOp, allocateGPRRegister(*MI.getMF(), MRI)); + Changed = true; + } + } + + return Changed; +} + +bool RISCVExegesisPostprocessing::processWriteFRM(MachineInstr &MI, + MachineRegisterInfo &MRI) { + // The virtual register will be the first operand in both SwapFRMImm and + // WriteFRM. + if (MI.getOperand(0).isReg()) { + Register DestReg = MI.getOperand(0).getReg(); + if (DestReg.isVirtual()) { + MRI.replaceRegWith(DestReg, allocateGPRRegister(*MI.getMF(), MRI)); + return true; + } + } + return false; +} + +FunctionPass *llvm::exegesis::createRISCVPostprocessingPass() { + return new RISCVExegesisPostprocessing(); +} diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPreprocessing.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPreprocessing.cpp new file mode 100644 index 0000000000000..ad3245f88201f --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/RISCVExegesisPreprocessing.cpp @@ -0,0 +1,82 @@ +//===- RISCVExegesisPreprocessing.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVExegesisPasses.h" +#include "RISCVRegisterInfo.h" +#include "RISCVSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-exegesis-preprocessing" + +namespace { +struct RISCVExegesisPreprocessing : public MachineFunctionPass { + static char ID; + + RISCVExegesisPreprocessing() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // anonymous namespace + +char RISCVExegesisPreprocessing::ID = 0; + +static bool processAVLOperand(MachineInstr &MI, MachineRegisterInfo &MRI, + const TargetInstrInfo &TII) { + const MCInstrDesc &Desc = TII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + if (!RISCVII::hasVLOp(TSFlags)) + return false; + + const MachineOperand &VLOp = MI.getOperand(RISCVII::getVLOpNum(Desc)); + if (VLOp.isReg()) { + Register VLReg = VLOp.getReg(); + if (VLReg.isVirtual()) + return false; + assert(RISCV::GPRRegClass.contains(VLReg)); + // Replace all uses of the original physical register with a new virtual + // register. The only reason we can do such replacement here is because it's + // almost certain that VLReg only has a single definition. 
+ Register NewVLReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + MRI.replaceRegWith(VLReg, NewVLReg); + return true; + } + + return false; +} + +bool RISCVExegesisPreprocessing::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const auto &STI = MF.getSubtarget(); + if (!STI.hasVInstructions()) + return false; + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + bool Changed = false; + for (auto &MBB : MF) + for (auto &MI : MBB) { + Changed |= processAVLOperand(MI, MRI, TII); + } + + return Changed; +} + +FunctionPass *llvm::exegesis::createRISCVPreprocessingPass() { + return new RISCVExegesisPreprocessing(); +} diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp new file mode 100644 index 0000000000000..f8d76620692df --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp @@ -0,0 +1,955 @@ +//===-- Target.cpp ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../Target.h" +#include "../ParallelSnippetGenerator.h" +#include "../SerialSnippetGenerator.h" +#include "../SnippetGenerator.h" +#include "MCTargetDesc/RISCVBaseInfo.h" +#include "MCTargetDesc/RISCVMatInt.h" +#include "RISCV.h" +#include "RISCVExegesisPasses.h" +#include "RISCVInstrInfo.h" +#include "RISCVRegisterInfo.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" + +#include + +#include + +#define GET_AVAILABLE_OPCODE_CHECKER +#include "RISCVGenInstrInfo.inc" + +namespace RVVPseudoTables { +using namespace llvm; +using namespace llvm::RISCV; + +struct PseudoInfo { + uint16_t Pseudo; + uint16_t BaseInstr; + uint8_t VLMul; + uint8_t SEW; +}; + +struct RISCVMaskedPseudoInfo { + uint16_t MaskedPseudo; + uint16_t UnmaskedPseudo; + uint8_t MaskOpIdx; +}; + +#define GET_RISCVVInversePseudosTable_IMPL +#define GET_RISCVVInversePseudosTable_DECL +#define GET_RISCVMaskedPseudosTable_DECL +#define GET_RISCVMaskedPseudosTable_IMPL +#include "RISCVGenSearchableTables.inc" + +} // namespace RVVPseudoTables + +namespace llvm { +namespace exegesis { + +static cl::opt + OnlyUsesVLMAXForVL("riscv-vlmax-for-vl", + cl::desc("Only enumerate VLMAX for VL operand"), + cl::init(false), cl::Hidden); + +static cl::opt + EnumerateRoundingModes("riscv-enumerate-rounding-modes", + cl::desc("Enumerate different FRM and VXRM"), + cl::init(true), cl::Hidden); + +static cl::opt + FilterConfig("riscv-filter-config", + cl::desc("Show only the configs matching this regex"), + cl::init(""), cl::Hidden); + +#include "RISCVGenExegesis.inc" + +namespace { + +static perf_event_attr *createPerfEventAttr(unsigned Type, uint64_t Config) { + auto *PEA = new perf_event_attr(); + 
memset(PEA, 0, sizeof(perf_event_attr)); + PEA->type = Type; + PEA->size = sizeof(perf_event_attr); + PEA->config = Config; + PEA->disabled = 1; + PEA->exclude_kernel = 1; + PEA->exclude_hv = 1; + return PEA; +} + +struct RISCVPerfEvent : public pfm::PerfEvent { + explicit RISCVPerfEvent(StringRef PfmEventString) + : pfm::PerfEvent(PfmEventString) { + FullQualifiedEventString = EventString; + + if (EventString == "CYCLES" || EventString == "CPU_CYCLES") + Attr = createPerfEventAttr(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + } +}; + +template class RVVSnippetGenerator : public BaseT { + static void printRoundingMode(raw_ostream &OS, unsigned Val, bool UsesVXRM) { + static const char *const FRMNames[] = {"rne", "rtz", "rdn", "rup", + "rmm", "N/A", "N/A", "dyn"}; + static const char *const VXRMNames[] = {"rnu", "rne", "rdn", "rod"}; + + if (UsesVXRM) { + assert(Val < 4); + OS << VXRMNames[Val]; + } else { + assert(Val != 5 && Val != 6); + OS << FRMNames[Val]; + } + } + + static constexpr unsigned MinSEW = 8; + // ELEN is basically SEW_max. + static constexpr unsigned ELEN = 64; + + // We can't know the real min/max VLEN w/o a Function, so we're + // using the VLen from Zvl. + unsigned ZvlVLen = 32; + + /// Mask for registers that are NOT standalone registers like X0 and V0 + BitVector AggregateRegisters; + + // Returns true when opcode is available in any of the FBs. + static bool + isOpcodeAvailableIn(unsigned Opcode, + ArrayRef FBs) { + FeatureBitset RequiredFeatures = RISCV_MC::computeRequiredFeatures(Opcode); + for (uint8_t FB : FBs) { + if (RequiredFeatures[FB]) + return true; + } + return false; + } + + static bool isRVVFloatingPointOp(unsigned Opcode) { + return isOpcodeAvailableIn(Opcode, + {RISCV_MC::Feature_HasVInstructionsAnyFBit}); + } + + // Get the element group width of each vector cryptor extension. 
+ static unsigned getZvkEGWSize(unsigned Opcode, unsigned SEW) { + using namespace RISCV_MC; + if (isOpcodeAvailableIn(Opcode, {Feature_HasStdExtZvkgBit, + Feature_HasStdExtZvknedBit, + Feature_HasStdExtZvksedBit})) + return 128U; + else if (isOpcodeAvailableIn(Opcode, {Feature_HasStdExtZvkshBit})) + return 256U; + else if (isOpcodeAvailableIn(Opcode, {Feature_HasStdExtZvknhaOrZvknhbBit})) + // In Zvknh[ab], when SEW=64 is used (i.e. Zvknhb), EGW is 256. + // Otherwise it's 128. + return SEW == 64 ? 256U : 128U; + + llvm_unreachable("Unsupported opcode"); + } + + // A handy utility to multiply or divide an integer by LMUL. + template static T multiplyLMul(T Val, RISCVII::VLMUL LMul) { + // Fractional + if (LMul >= RISCVII::LMUL_F8) + return Val >> (8 - LMul); + else + return Val << LMul; + } + + /// Return the denominator of the fractional (i.e. the `x` in .vfx suffix) or + /// nullopt if BaseOpcode is not a vector sext/zext. + static std::optional isRVVSignZeroExtend(unsigned BaseOpcode) { + switch (BaseOpcode) { + case RISCV::VSEXT_VF2: + case RISCV::VZEXT_VF2: + return 2; + case RISCV::VSEXT_VF4: + case RISCV::VZEXT_VF4: + return 4; + case RISCV::VSEXT_VF8: + case RISCV::VZEXT_VF8: + return 8; + default: + return std::nullopt; + } + } + + void annotateWithVType(const CodeTemplate &CT, const Instruction &Instr, + unsigned BaseOpcode, + const BitVector &ForbiddenRegisters, + std::vector &Result) const; + +public: + RVVSnippetGenerator(const LLVMState &State, + const SnippetGenerator::Options &Opts) + : BaseT(State, Opts), + AggregateRegisters(State.getRegInfo().getNumRegs(), /*initVal=*/true) { + // Initialize standalone registers mask. 
+ const MCRegisterInfo &RegInfo = State.getRegInfo(); + const unsigned StandaloneRegClasses[] = { + RISCV::GPRRegClassID, RISCV::FPR16RegClassID, RISCV::VRRegClassID}; + + for (unsigned RegClassID : StandaloneRegClasses) + for (unsigned Reg : RegInfo.getRegClass(RegClassID)) { + AggregateRegisters.reset(Reg); + } + + // Initialize the ZvlVLen. + const MCSubtargetInfo &STI = State.getSubtargetInfo(); + std::string ZvlQuery; + for (unsigned I = 5U, Size = (1 << I); I < 17U; ++I, Size <<= 1) { + ZvlQuery = "+zvl"; + raw_string_ostream SS(ZvlQuery); + SS << Size << "b"; + if (STI.checkFeatures(SS.str()) && ZvlVLen < Size) + ZvlVLen = Size; + } + } + + Expected> + generateCodeTemplates(InstructionTemplate Variant, + const BitVector &ForbiddenRegisters) const override; +}; + +static bool isMaskedSibiling(unsigned MaskedOp, unsigned UnmaskedOp) { + const auto *RVVMasked = RVVPseudoTables::getMaskedPseudoInfo(MaskedOp); + return RVVMasked && RVVMasked->UnmaskedPseudo == UnmaskedOp; +} + +// There are primarily two kinds of opcodes that are not eligible +// in a serial snippet: +// (1) Only has a single use operand that cannot overlap with +// the def operand. +// (2) The register file of the only use operand is different from +// that of the def operand. For instance, use operand is vector and +// the result is a scalar. +static bool isIneligibleOfSerialSnippets(unsigned BaseOpcode, + const Instruction &I) { + if (llvm::any_of(I.Operands, + [](const Operand &Op) { return Op.isEarlyClobber(); })) + return true; + + switch (BaseOpcode) { + case RISCV::VCOMPRESS_VM: + case RISCV::VCPOP_M: + case RISCV::VCPOP_V: + case RISCV::VRGATHEREI16_VV: + case RISCV::VRGATHER_VI: + case RISCV::VRGATHER_VV: + case RISCV::VRGATHER_VX: + case RISCV::VSLIDE1UP_VX: + case RISCV::VSLIDEUP_VI: + case RISCV::VSLIDEUP_VX: + // The truncate instructions that arrive here are those who cannot + // have any overlap between source and dest at all (i.e. 
+ // those who don't satisfy conditions 2 and 3 in RVV spec + // 5.2). + case RISCV::VNCLIPU_WI: + case RISCV::VNCLIPU_WV: + case RISCV::VNCLIPU_WX: + case RISCV::VNCLIP_WI: + case RISCV::VNCLIP_WV: + case RISCV::VNCLIP_WX: + return true; + default: + return false; + } +} + +static bool isZvfhminZvfbfminOpcodes(unsigned BaseOpcode) { + switch (BaseOpcode) { + case RISCV::VFNCVT_F_F_W: + case RISCV::VFWCVT_F_F_V: + case RISCV::VFNCVTBF16_F_F_W: + case RISCV::VFWCVTBF16_F_F_V: + return true; + default: + return false; + } +} + +static bool isVectorReduction(unsigned BaseOpcode) { + switch (BaseOpcode) { + case RISCV::VREDAND_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDSUM_VS: + case RISCV::VREDXOR_VS: + case RISCV::VWREDSUMU_VS: + case RISCV::VWREDSUM_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFREDOSUM_VS: + case RISCV::VFREDUSUM_VS: + return true; + default: + return false; + } +} + +template +void RVVSnippetGenerator::annotateWithVType( + const CodeTemplate &OrigCT, const Instruction &Instr, unsigned BaseOpcode, + const BitVector &ForbiddenRegisters, + std::vector &Result) const { + const MCSubtargetInfo &STI = SnippetGenerator::State.getSubtargetInfo(); + unsigned VPseudoOpcode = Instr.getOpcode(); + + bool IsSerial = std::is_same_v; + + const MCInstrDesc &MIDesc = Instr.Description; + const uint64_t TSFlags = MIDesc.TSFlags; + + RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags); + + const size_t StartingResultSize = Result.size(); + + SmallPtrSet VTypeOperands; + std::optional SelfAliasing; + // Exegesis sees instructions with tied operands as being inherently serial. + // But for RVV instructions, those tied operands are passthru rather + // than real read operands. So we manually put dependency between + // destination (i.e. def) and any of the non-tied/SEW/policy/AVL/RM + // operands.
+ auto assignSerialRVVOperands = [&, this](InstructionTemplate &IT) { + // Initialize SelfAliasing on first use. + if (!SelfAliasing.has_value()) { + BitVector ExcludeRegs = ForbiddenRegisters; + ExcludeRegs |= AggregateRegisters; + SelfAliasing = AliasingConfigurations(Instr, Instr, ExcludeRegs); + bool EmptyUses = false; + for (auto &ARO : SelfAliasing->Configurations) { + auto &Uses = ARO.Uses; + for (auto ROA = Uses.begin(); ROA != Uses.end();) { + const Operand *Op = ROA->Op; + // Exclude tied operand(s). + if (Op->isTied()) { + ROA = Uses.erase(ROA); + continue; + } + + // Special handling for reduction operations: for a given reduction + // `vredop vd, vs2, vs1`, we don't want vd to be aliased with vs1 + // since we're only reading `vs1[0]` and many implementations + // optimize for this case (e.g. chaining). Instead, we're forcing + // it to create alias between vd and vs2. + if (isVectorReduction(BaseOpcode) && + // vs1's operand index is always 3. + Op->getIndex() == 3) { + ROA = Uses.erase(ROA); + continue; + } + + // Exclude any special operands like SEW and VL -- we've already + // assigned values to them. + if (VTypeOperands.count(Op)) { + ROA = Uses.erase(ROA); + continue; + } + ++ROA; + } + + // If any of the use operand candidate lists is empty, there is + // no point to assign self aliasing registers. + if (Uses.empty()) { + EmptyUses = true; + break; + } + } + if (EmptyUses) + SelfAliasing->Configurations.clear(); + } + + // This is a self aliasing instruction so defs and uses are from the same + // instance, hence twice IT in the following call. + if (!SelfAliasing->empty() && !SelfAliasing->hasImplicitAliasing()) + setRandomAliasing(*SelfAliasing, IT, IT); + }; + + // We are going to create a CodeTemplate (configuration) for each supported + // SEW, policy, and VL. + // FIXME: Account for EEW and EMUL. 
+ SmallVector, 4> Log2SEWs; + SmallVector, 4> Policies; + SmallVector, 3> AVLs; + SmallVector, 8> RoundingModes; + + bool HasSEWOp = RISCVII::hasSEWOp(TSFlags); + bool HasPolicyOp = RISCVII::hasVecPolicyOp(TSFlags); + bool HasVLOp = RISCVII::hasVLOp(TSFlags); + bool HasRMOp = RISCVII::hasRoundModeOp(TSFlags); + bool UsesVXRM = RISCVII::usesVXRM(TSFlags); + + if (HasSEWOp) { + VTypeOperands.insert(&Instr.Operands[RISCVII::getSEWOpNum(MIDesc)]); + + SmallVector SEWCandidates; + + // (RVV spec 3.4.2) For fractional LMUL, the supported SEW are between + // [SEW_min, LMUL * ELEN]. + unsigned SEWUpperBound = + VLMul >= RISCVII::LMUL_F8 ? multiplyLMul(ELEN, VLMul) : ELEN; + for (unsigned SEW = MinSEW; SEW <= SEWUpperBound; SEW <<= 1) { + SEWCandidates.push_back(SEW); + + // Some scheduling classes already integrate SEW; only put + // their corresponding SEW values at the SEW operands. + // NOTE: It is imperative to put this condition in the front, otherwise + // it is tricky and difficult to know if there is an integrated + // SEW after other rules are applied to filter the candidates. + const auto *RVVBase = + RVVPseudoTables::getBaseInfo(BaseOpcode, VLMul, SEW); + if (RVVBase && (RVVBase->Pseudo == VPseudoOpcode || + isMaskedSibiling(VPseudoOpcode, RVVBase->Pseudo) || + isMaskedSibiling(RVVBase->Pseudo, VPseudoOpcode))) { + // There is an integrated SEW, remove all but the SEW pushed last. + SEWCandidates.erase(SEWCandidates.begin(), SEWCandidates.end() - 1); + break; + } + } + + // Filter out some candidates. + for (auto SEW = SEWCandidates.begin(); SEW != SEWCandidates.end();) { + // For floating point operations, only select SEW of the supported FLEN. 
+ if (isRVVFloatingPointOp(VPseudoOpcode)) { + bool Supported = false; + Supported |= isZvfhminZvfbfminOpcodes(BaseOpcode) && *SEW == 16; + Supported |= STI.hasFeature(RISCV::FeatureStdExtZvfh) && *SEW == 16; + Supported |= STI.hasFeature(RISCV::FeatureStdExtF) && *SEW == 32; + Supported |= STI.hasFeature(RISCV::FeatureStdExtD) && *SEW == 64; + if (!Supported) { + SEW = SEWCandidates.erase(SEW); + continue; + } + } + + // The EEW for source operand in VSEXT and VZEXT is a fraction + // of the SEW, hence only SEWs that will lead to valid EEW are allowed. + if (auto Frac = isRVVSignZeroExtend(BaseOpcode)) + if (*SEW / *Frac < MinSEW) { + SEW = SEWCandidates.erase(SEW); + continue; + } + + // Most vector crypto 1.0 instructions only work on SEW=32. + using namespace RISCV_MC; + if (isOpcodeAvailableIn(BaseOpcode, {Feature_HasStdExtZvkgBit, + Feature_HasStdExtZvknedBit, + Feature_HasStdExtZvknhaOrZvknhbBit, + Feature_HasStdExtZvksedBit, + Feature_HasStdExtZvkshBit})) { + if (*SEW != 32) + // Zvknhb supports SEW=64 as well. + if (*SEW != 64 || !STI.hasFeature(RISCV::FeatureStdExtZvknhb) || + !isOpcodeAvailableIn(BaseOpcode, + {Feature_HasStdExtZvknhaOrZvknhbBit})) { + SEW = SEWCandidates.erase(SEW); + continue; + } + + // We're also enforcing the requirement of `LMUL * VLEN >= EGW` here, + // because some of the extensions have SEW-dependent EGW. + unsigned EGW = getZvkEGWSize(BaseOpcode, *SEW); + if (multiplyLMul(ZvlVLen, VLMul) < EGW) { + SEW = SEWCandidates.erase(SEW); + continue; + } + } + + ++SEW; + } + + // We're not going to produce any result with zero SEW candidate. + if (SEWCandidates.empty()) + return; + + for (unsigned SEW : SEWCandidates) + Log2SEWs.push_back(SEW == 8 ?
0 : Log2_32(SEW)); + } else { + Log2SEWs.push_back(std::nullopt); + } + + if (HasPolicyOp) { + VTypeOperands.insert(&Instr.Operands[RISCVII::getVecPolicyOpNum(MIDesc)]); + + Policies = {0, RISCVII::TAIL_AGNOSTIC, RISCVII::MASK_AGNOSTIC, + (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC)}; + } else { + Policies.push_back(std::nullopt); + } + + if (HasVLOp) { + VTypeOperands.insert(&Instr.Operands[RISCVII::getVLOpNum(MIDesc)]); + + if (OnlyUsesVLMAXForVL) + AVLs.push_back(-1); + else + AVLs = {// 5-bit immediate value + 1, + // VLMAX + -1, + // Non-X0 register + 0}; + } else { + AVLs.push_back(std::nullopt); + } + + if (HasRMOp) { + VTypeOperands.insert(&Instr.Operands[RISCVII::getVLOpNum(MIDesc) - 1]); + + // If we're not enumerating all rounding modes, + // use zero (rne in FRM and rnu in VXRM) as the default + // mode. + RoundingModes = {0U}; + if (EnumerateRoundingModes) { + RoundingModes.append({1, 2, 3}); + if (!UsesVXRM) + // FRM values 5 and 6 are currently reserved. + RoundingModes.append({4, 7}); + } + } else { + RoundingModes = {std::nullopt}; + } + + std::set, std::optional, + std::optional, std::optional>> + Combinations; + for (auto AVL : AVLs) { + for (auto Log2SEW : Log2SEWs) + for (auto Policy : Policies) { + for (auto RM : RoundingModes) + Combinations.insert(std::make_tuple(RM, AVL, Log2SEW, Policy)); + } + } + + std::string ConfigStr; + SmallVector, 4> ValueAssignments; + for (const auto &[RM, AVL, Log2SEW, Policy] : Combinations) { + InstructionTemplate IT(&Instr); + + ListSeparator LS; + ConfigStr = "vtype = {"; + raw_string_ostream SS(ConfigStr); + + ValueAssignments.clear(); + + if (RM) { + const Operand &Op = Instr.Operands[RISCVII::getVLOpNum(MIDesc) - 1]; + ValueAssignments.push_back({&Op, MCOperand::createImm(*RM)}); + printRoundingMode(SS << LS << (UsesVXRM ? 
"VXRM" : "FRM") << ": ", *RM, + UsesVXRM); + } + + if (AVL) { + MCOperand OpVal; + if (*AVL < 0) { + // VLMAX + OpVal = MCOperand::createImm(-1); + SS << LS << "AVL: VLMAX"; + } else if (*AVL == 0) { + // A register holding AVL. + // TODO: Generate a random register. + OpVal = MCOperand::createReg(RISCV::X5); + OpVal.print(SS << LS << "AVL: "); + } else { + // A 5-bit immediate. + // The actual value assignment is deferred to + // RISCVExegesisTarget::randomizeTargetMCOperand. + SS << LS << "AVL: simm5"; + } + if (OpVal.isValid()) { + const Operand &Op = Instr.Operands[RISCVII::getVLOpNum(MIDesc)]; + ValueAssignments.push_back({&Op, OpVal}); + } + } + + if (Log2SEW) { + const Operand &Op = Instr.Operands[RISCVII::getSEWOpNum(MIDesc)]; + ValueAssignments.push_back({&Op, MCOperand::createImm(*Log2SEW)}); + SS << LS << "SEW: e" << (*Log2SEW ? 1 << *Log2SEW : 8); + } + + if (Policy) { + const Operand &Op = Instr.Operands[RISCVII::getVecPolicyOpNum(MIDesc)]; + ValueAssignments.push_back({&Op, MCOperand::createImm(*Policy)}); + SS << LS << "Policy: " << (*Policy & RISCVII::TAIL_AGNOSTIC ? "ta" : "tu") + << "/" << (*Policy & RISCVII::MASK_AGNOSTIC ? "ma" : "mu"); + } + + SS << "}"; + + // Filter out some configurations, if needed. + if (!FilterConfig.empty()) { + if (!Regex(FilterConfig).match(ConfigStr)) + continue; + } + + CodeTemplate CT = OrigCT.clone(); + CT.Config = std::move(ConfigStr); + for (InstructionTemplate &IT : CT.Instructions) { + if (IsSerial) { + // Reset this template's value assignments and do it + // ourselves. 
+ IT = InstructionTemplate(&Instr); + assignSerialRVVOperands(IT); + } + + for (const auto &[Op, OpVal] : ValueAssignments) + IT.getValueFor(*Op) = OpVal; + } + Result.push_back(std::move(CT)); + if (Result.size() - StartingResultSize >= + SnippetGenerator::Opts.MaxConfigsPerOpcode) + return; + } +} + +template +Expected> +RVVSnippetGenerator::generateCodeTemplates( + InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { + const Instruction &Instr = Variant.getInstr(); + + bool IsSerial = std::is_same_v; + + unsigned BaseOpcode = RISCV::getRVVMCOpcode(Instr.getOpcode()); + + // Bail out ineligible opcodes before generating base code templates since + // the latter is quite expensive. + if (IsSerial && BaseOpcode && isIneligibleOfSerialSnippets(BaseOpcode, Instr)) + return std::vector{}; + + auto BaseCodeTemplates = + BaseT::generateCodeTemplates(Variant, ForbiddenRegisters); + if (!BaseCodeTemplates) + return BaseCodeTemplates.takeError(); + + // We only specialize for RVVPseudo here + if (!BaseOpcode) + return BaseCodeTemplates; + + std::vector ExpandedTemplates; + for (const auto &BaseCT : *BaseCodeTemplates) + annotateWithVType(BaseCT, Instr, BaseOpcode, ForbiddenRegisters, + ExpandedTemplates); + + return ExpandedTemplates; +} + +// NOTE: Alternatively, we can use BitVector here, but the number of RVV opcodes +// is just a small portion of the entire opcode space, so I thought it would be +// a waste of space to use BitVector. 
+static SmallSet RVVOpcodesWithPseudos; + +class ExegesisRISCVTarget : public ExegesisTarget { +public: + ExegesisRISCVTarget() + : ExegesisTarget(RISCVCpuPfmCounters, RISCV_MC::isOpcodeAvailable) {} + +private: + bool isOpcodeSupported(const MCInstrDesc &Desc) const override { + switch (Desc.getOpcode()) { + case RISCV::PseudoVSETIVLI: + case RISCV::PseudoVSETVLI: + case RISCV::PseudoVSETVLIX0: + case RISCV::VSETIVLI: + case RISCV::VSETVLI: + case RISCV::VSETVL: + return false; + default: + break; + } + + // We want to support all the RVV pseudos. + if (unsigned Opcode = RISCV::getRVVMCOpcode(Desc.getOpcode())) { + RVVOpcodesWithPseudos.insert(Opcode); + return true; + } + + // We don't want to support RVV instructions that depend on VTYPE, because + // those instructions by themselves don't carry any additional information + // for us to setup the proper VTYPE environment via VSETVL instructions. + // FIXME: Ideally, we should have a list of such RVV instructions...except + // we don't have, hence we use an ugly trick here to memorize the + // corresponding MC opcodes of the RVV pseudo we have processed previously. + // This works most of the time because RVV pseudo opcodes are placed before + // any other RVV opcodes. Of course this doesn't work if we're asked to + // benchmark only a certain subset of opcodes. + if (RVVOpcodesWithPseudos.count(Desc.getOpcode())) + return false; + + return ExegesisTarget::isOpcodeSupported(Desc); + } + + Error + randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var, + MCOperand &AssignedValue, + const BitVector &ForbiddenRegs) const override { + const Operand &Op = Instr.getPrimaryOperand(Var); + switch (Op.getExplicitOperandInfo().OperandType) { + case RISCVOp::OPERAND_SIMM5: + // 5-bit signed immediate value. + AssignedValue = MCOperand::createImm(randomIndex(31) - 16); + return Error::success(); + case RISCVOp::OPERAND_AVL: + case RISCVOp::OPERAND_UIMM5: + // 5-bit unsigned immediate value. 
+ AssignedValue = MCOperand::createImm(randomIndex(31)); + return Error::success(); + default: + break; + } + return make_error( + Twine("unimplemented operand type ") + .concat(std::to_string(Op.getExplicitOperandInfo().OperandType))); + } + + static std::vector loadIntImmediate(const MCSubtargetInfo &STI, + unsigned Reg, + const APInt &Value) { + // Lower to materialization sequence. + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(Value.getSExtValue(), STI); + assert(!Seq.empty()); + + Register DstReg = Reg; + Register SrcReg = RISCV::X0; + + std::vector Insts; + for (const RISCVMatInt::Inst &Inst : Seq) { + switch (Inst.getOpndKind()) { + case RISCVMatInt::Imm: + Insts.emplace_back(MCInstBuilder(Inst.getOpcode()) + .addReg(DstReg) + .addImm(Inst.getImm())); + break; + case RISCVMatInt::RegX0: + Insts.emplace_back(MCInstBuilder(Inst.getOpcode()) + .addReg(DstReg) + .addReg(SrcReg) + .addReg(RISCV::X0)); + break; + case RISCVMatInt::RegReg: + Insts.emplace_back(MCInstBuilder(Inst.getOpcode()) + .addReg(DstReg) + .addReg(SrcReg) + .addReg(SrcReg)); + break; + case RISCVMatInt::RegImm: + Insts.emplace_back(MCInstBuilder(Inst.getOpcode()) + .addReg(DstReg) + .addReg(SrcReg) + .addImm(Inst.getImm())); + break; + } + + // Only the first instruction has X0 as its source. + SrcReg = DstReg; + } + return Insts; + } + + // Note that we assume the given APInt is an integer rather than a bit-casted + // floating point value. + static std::vector loadFPImmediate(unsigned FLen, + const MCSubtargetInfo &STI, + unsigned Reg, const APInt &Value) { + // Try FLI from the Zfa extension. + if (STI.hasFeature(RISCV::FeatureStdExtZfa)) { + APFloat FloatVal(FLen == 32 ? APFloat::IEEEsingle() + : APFloat::IEEEdouble()); + if (FloatVal.convertFromAPInt(Value, /*IsSigned=*/Value.isSignBitSet(), + APFloat::rmNearestTiesToEven) == + APFloat::opOK) { + int Idx = RISCVLoadFPImm::getLoadFPImm(FloatVal); + if (Idx >= 0) + return {MCInstBuilder(FLen == 32 ? 
RISCV::FLI_S : RISCV::FLI_D) + .addReg(Reg) + .addImm(static_cast(Idx))}; + } + } + + // Otherwise, move the value to a GPR (t0) first. + assert(Reg != RISCV::X5); + auto ImmSeq = loadIntImmediate(STI, RISCV::X5, Value); + + // Then, use FCVT. + unsigned Opcode; + if (FLen == 32) + Opcode = Value.getBitWidth() <= 32 ? RISCV::FCVT_S_W : RISCV::FCVT_S_L; + else + Opcode = Value.getBitWidth() <= 32 ? RISCV::FCVT_D_W : RISCV::FCVT_D_L; + ImmSeq.emplace_back( + MCInstBuilder(Opcode).addReg(Reg).addReg(RISCV::X5).addImm( + RISCVFPRndMode::RNE)); + + return ImmSeq; + } + + std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + const APInt &Value) const override { + if (Reg == RISCV::X0) { + if (Value == 0U) + // NOP + return {MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X0) + .addReg(RISCV::X0) + .addImm(0U)}; + errs() << "Cannot write non-zero values to X0\n"; + return {}; + } + + if (RISCV::GPRNoX0RegClass.contains(Reg)) + return loadIntImmediate(STI, Reg, Value); + if (RISCV::FPR32RegClass.contains(Reg) && + STI.hasFeature(RISCV::FeatureStdExtF)) + return loadFPImmediate(32, STI, Reg, Value); + if (RISCV::FPR64RegClass.contains(Reg) && + STI.hasFeature(RISCV::FeatureStdExtD)) + return loadFPImmediate(64, STI, Reg, Value); + return {}; + } + + RegisterValue assignInitialRegisterValue(const Instruction &I, + const Operand &Op, + unsigned Reg) const override { + // If this is a register AVL, we don't want to assign 0 or VLMAX VL. + if (Op.isExplicit() && + Op.getExplicitOperandInfo().OperandType == RISCVOp::OPERAND_AVL) { + // Assume VLEN is 128 here. + constexpr unsigned VLEN = 128; + // VLMAX is at most VLEN since + // VLMAX = VLEN / SEW * LMUL (smallest SEW = 8, largest LMUL = 8). + return RegisterValue{Reg, APInt(32, randomIndex(VLEN - 4) + 2)}; + } + + switch (I.getOpcode()) { + // We don't want division by zero for these opcodes.
+ case RISCV::DIV: + case RISCV::DIVU: + case RISCV::DIVW: + case RISCV::DIVUW: + case RISCV::REM: + case RISCV::REMU: + case RISCV::REMW: + case RISCV::REMUW: + // Multiplication and its friends are not really interesting + // when they're multiplied by zero. + case RISCV::MUL: + case RISCV::MULH: + case RISCV::MULHSU: + case RISCV::MULHU: + case RISCV::MULW: + case RISCV::CPOP: + case RISCV::CPOPW: + return RegisterValue{Reg, APInt(32, randomIndex(INT32_MAX - 1) + 1)}; + default: + return ExegesisTarget::assignInitialRegisterValue(I, Op, Reg); + } + } + + bool matchesArch(Triple::ArchType Arch) const override { + return Arch == Triple::riscv32 || Arch == Triple::riscv64; + } + + unsigned getDefaultLoopCounterRegister(const Triple &TT) const override { + return RISCV::X5; + } + + void decrementLoopCounterAndJump(MachineBasicBlock &MBB, + MachineBasicBlock &TargetMBB, + const MCInstrInfo &MII, + unsigned LoopRegister) const override { + MIMetadata MIMD; + BuildMI(MBB, MBB.end(), MIMD, MII.get(RISCV::ADDI), LoopRegister) + .addUse(LoopRegister) + .addImm(-1); + BuildMI(MBB, MBB.end(), MIMD, MII.get(RISCV::BNE)) + .addUse(LoopRegister) + .addUse(RISCV::X0) + .addMBB(&TargetMBB); + } + + std::unique_ptr createSerialSnippetGenerator( + const LLVMState &State, + const SnippetGenerator::Options &Opts) const override { + return std::make_unique>(State, + Opts); + } + + std::unique_ptr createParallelSnippetGenerator( + const LLVMState &State, + const SnippetGenerator::Options &Opts) const override { + return std::make_unique>( + State, Opts); + } + + Expected> + createCounter(StringRef CounterName, const LLVMState &, + ArrayRef ValidationCounters, + const pid_t ProcessID) const override { + auto Event = static_cast(RISCVPerfEvent(CounterName)); + if (!Event.valid()) + return llvm::make_error( + llvm::Twine("Unable to create counter with name '") + .concat(CounterName) + .concat("'")); + + std::vector ValidationEvents; + for (const char *ValCounterName :
ValidationCounters) { + ValidationEvents.emplace_back(ValCounterName); + if (!ValidationEvents.back().valid()) + return llvm::make_error( + llvm::Twine("Unable to create validation counter with name '") + .concat(ValCounterName) + .concat("'")); + } + + return std::make_unique( + std::move(Event), std::move(ValidationEvents), ProcessID); + } + + void addTargetSpecificPasses(PassManagerBase &PM) const override { + // Turn AVL operand of physical registers into virtual registers. + PM.add(exegesis::createRISCVPreprocessingPass()); + PM.add(createRISCVInsertVSETVLIPass()); + // Setting up the correct FRM. + PM.add(createRISCVInsertReadWriteCSRPass()); + PM.add(createRISCVInsertWriteVXRMPass()); + // This will assign physical register to the result of VSETVLI instructions + // that produce VLMAX. + PM.add(exegesis::createRISCVPostprocessingPass()); + // PseudoRET will be expanded by RISCVAsmPrinter; we have to expand + // PseudoMovImm with RISCVPostRAExpandPseudoPass though. + PM.add(createRISCVPostRAExpandPseudoPass()); + } +}; + +} // namespace + +static ExegesisTarget *getTheExegesisRISCVTarget() { + static ExegesisRISCVTarget Target; + return &Target; +} + +void InitializeRISCVExegesisTarget() { + ExegesisTarget::registerTarget(getTheExegesisRISCVTarget()); +} + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp b/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp index 0690c21220f89..55c814647c685 100644 --- a/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp +++ b/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp @@ -84,17 +84,19 @@ getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc, // TODO: Handle AcquireAtAtCycle in llvm-exegesis and llvm-mca. 
See // https://github.com/llvm/llvm-project/issues/62680 and // https://github.com/llvm/llvm-project/issues/62681 - assert(WPR->AcquireAtCycle == 0 && - "`llvm-exegesis` does not handle AcquireAtCycle > 0"); + // assert(WPR->AcquireAtCycle == 0 && + // "`llvm-exegesis` does not handle AcquireAtCycle > 0"); + assert(WPR->ReleaseAtCycle > WPR->AcquireAtCycle); if (ProcResDesc->SubUnitsIdxBegin == nullptr) { // This is a ProcResUnit. Result.push_back( {WPR->ProcResourceIdx, WPR->ReleaseAtCycle, WPR->AcquireAtCycle}); - ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->ReleaseAtCycle; + ProcResUnitUsage[WPR->ProcResourceIdx] += + (WPR->ReleaseAtCycle - WPR->AcquireAtCycle); } else { // This is a ProcResGroup. First see if it contributes any cycles or if // it has cycles just from subunits. - float RemainingCycles = WPR->ReleaseAtCycle; + float RemainingCycles = (WPR->ReleaseAtCycle - WPR->AcquireAtCycle); for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; ++SubResIdx) { @@ -106,7 +108,8 @@ getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc, } // The ProcResGroup contributes `RemainingCycles` cycles of its own. Result.push_back({WPR->ProcResourceIdx, - static_cast(std::round(RemainingCycles)), + static_cast(WPR->AcquireAtCycle + + std::round(RemainingCycles)), WPR->AcquireAtCycle}); // Spread the remaining cycles over all subunits. 
for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; @@ -116,6 +119,10 @@ getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc, } } } + + sort(Result, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) { + return A.ProcResourceIdx < B.ProcResourceIdx; + }); return Result; } @@ -198,27 +205,25 @@ static void distributePressure(float RemainingPressure, } } -std::vector> -computeIdealizedProcResPressure(const MCSchedModel &SM, - SmallVector WPRS) { +std::vector> computeIdealizedProcResPressure( + const MCSchedModel &SM, const SmallVector &WPRS) { // DensePressure[I] is the port pressure for Proc Resource I. SmallVector DensePressure(SM.getNumProcResourceKinds()); - sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) { - return A.ProcResourceIdx < B.ProcResourceIdx; - }); for (const MCWriteProcResEntry &WPR : WPRS) { // Get units for the entry. const MCProcResourceDesc *const ProcResDesc = SM.getProcResource(WPR.ProcResourceIdx); if (ProcResDesc->SubUnitsIdxBegin == nullptr) { // This is a ProcResUnit. - DensePressure[WPR.ProcResourceIdx] += WPR.ReleaseAtCycle; + DensePressure[WPR.ProcResourceIdx] += + (WPR.ReleaseAtCycle - WPR.AcquireAtCycle); } else { // This is a ProcResGroup. SmallVector Subunits(ProcResDesc->SubUnitsIdxBegin, ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits); - distributePressure(WPR.ReleaseAtCycle, Subunits, DensePressure); + distributePressure(WPR.ReleaseAtCycle - WPR.AcquireAtCycle, Subunits, + DensePressure); } } // Turn dense pressure into sparse pressure by removing zero entries. 
@@ -284,6 +289,36 @@ static unsigned findProcResIdx(const MCSubtargetInfo &STI, return 0; } +static int getMinimumBypassCycles(ArrayRef Entries, + unsigned WriteResourceID) { + if (Entries.empty()) + return 0; + + int BypassCycles = INT_MAX; + for (const MCReadAdvanceEntry &E : Entries) { + if (E.WriteResourceID != WriteResourceID) + continue; + BypassCycles = std::min(BypassCycles, E.Cycles); + } + + return BypassCycles == INT_MAX ? 0 : BypassCycles; +} + +unsigned ResolvedSchedClass::computeNormalizedWriteLatency( + const MCWriteLatencyEntry *WLE, const MCSubtargetInfo &STI) const { + assert(WLE); + auto ReadAdvances = STI.getReadAdvanceEntries(*SCDesc); + int MinBypass = getMinimumBypassCycles(ReadAdvances, WLE->WriteResourceID); + + unsigned Latency = WLE->Cycles; + if (MinBypass > 0 && unsigned(MinBypass) >= Latency) + Latency = 0; + else + Latency = Latency - MinBypass; + + return Latency; +} + std::vector ResolvedSchedClass::getAsPoint( Benchmark::ModeE Mode, const MCSubtargetInfo &STI, ArrayRef Representative) const { @@ -301,8 +336,10 @@ std::vector ResolvedSchedClass::getAsPoint( for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) { const MCWriteLatencyEntry *const WLE = STI.getWriteLatencyEntry(SCDesc, I); + + unsigned Latency = computeNormalizedWriteLatency(WLE, STI); LatencyMeasure.PerInstructionValue = - std::max(LatencyMeasure.PerInstructionValue, WLE->Cycles); + std::max(LatencyMeasure.PerInstructionValue, Latency); } } else if (Mode == Benchmark::Uops) { for (auto I : zip(SchedClassPoint, Representative)) { diff --git a/llvm/tools/llvm-exegesis/lib/SchedClassResolution.h b/llvm/tools/llvm-exegesis/lib/SchedClassResolution.h index 2347449b8f23d..2803c7bc17f3b 100644 --- a/llvm/tools/llvm-exegesis/lib/SchedClassResolution.h +++ b/llvm/tools/llvm-exegesis/lib/SchedClassResolution.h @@ -31,9 +31,8 @@ namespace exegesis { // Computes the idealized ProcRes Unit pressure. 
This is the expected // distribution if the CPU scheduler can distribute the load as evenly as // possible. -std::vector> -computeIdealizedProcResPressure(const MCSchedModel &SM, - SmallVector WPRS); +std::vector> computeIdealizedProcResPressure( + const MCSchedModel &SM, const SmallVector &WPRS); // An MCSchedClassDesc augmented with some additional data. struct ResolvedSchedClass { @@ -48,6 +47,9 @@ struct ResolvedSchedClass { getAsPoint(Benchmark::ModeE Mode, const MCSubtargetInfo &STI, ArrayRef Representative) const; + unsigned computeNormalizedWriteLatency(const MCWriteLatencyEntry *WLE, + const MCSubtargetInfo &STI) const; + const unsigned SchedClassId; const MCSchedClassDesc *const SCDesc; const bool WasVariant; // Whether the original class was variant. diff --git a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp index 7100b51bbb729..ec6e8c2f920a2 100644 --- a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp @@ -55,11 +55,8 @@ computeAliasingInstructions(const LLVMState &State, const Instruction *Instr, const Instruction &OtherInstr = State.getIC().getInstr(OtherOpcode); const MCInstrDesc &OtherInstrDesc = OtherInstr.Description; // Ignore instructions that we cannot run. 
- if (OtherInstrDesc.isPseudo() || OtherInstrDesc.usesCustomInsertionHook() || - OtherInstrDesc.isBranch() || OtherInstrDesc.isIndirectBranch() || - OtherInstrDesc.isCall() || OtherInstrDesc.isReturn()) { - continue; - } + if (!ET.isOpcodeSupported(OtherInstrDesc)) + continue; if (OtherInstr.hasMemoryOperands()) continue; if (!ET.allowAsBackToBack(OtherInstr)) diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 7dcff60a8fd11..b53dfb393ac07 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -108,6 +108,7 @@ std::vector SnippetGenerator::computeRegisterInitialValues( // Loop invariant: DefinedRegs[i] is true iif it has been set at least once // before the current instruction. BitVector DefinedRegs = State.getRATC().emptyRegisters(); + const auto &ET = State.getExegesisTarget(); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not @@ -121,18 +122,19 @@ std::vector SnippetGenerator::computeRegisterInitialValues( return IT.getValueFor(Op).getReg(); return 0; }; + const Instruction &I = IT.getInstr(); // Collect used registers that have never been def'ed. - for (const Operand &Op : IT.getInstr().Operands) { + for (const Operand &Op : I.Operands) { if (Op.isUse()) { const unsigned Reg = GetOpReg(Op); if (Reg > 0 && !DefinedRegs.test(Reg)) { - RIV.push_back(RegisterValue::zero(Reg)); + RIV.push_back(ET.assignInitialRegisterValue(I, Op, Reg)); DefinedRegs.set(Reg); } } } // Mark defs as having been def'ed. 
- for (const Operand &Op : IT.getInstr().Operands) { + for (const Operand &Op : I.Operands) { if (Op.isDef()) { const unsigned Reg = GetOpReg(Op); if (Reg > 0) @@ -286,16 +288,17 @@ Error randomizeUnsetVariables(const LLVMState &State, } Error validateGeneratedInstruction(const LLVMState &State, const MCInst &Inst) { - for (const auto &Operand : Inst) { - if (!Operand.isValid()) { + for (const auto &Operand : llvm::enumerate(Inst)) { + if (!Operand.value().isValid()) { // Mention the particular opcode - it is not necessarily the "main" // opcode being benchmarked by this snippet. For example, serial snippet // generator uses one more opcode when in SERIAL_VIA_NON_MEMORY_INSTR // execution mode. const auto OpcodeName = State.getInstrInfo().getName(Inst.getOpcode()); - return make_error("Not all operands were initialized by the " - "snippet generator for " + - OpcodeName + " opcode."); + return make_error( + "Operand #" + std::to_string(Operand.index()) + + " was not initialized by the snippet generator for " + OpcodeName + + " opcode."); } } return Error::success(); diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index 29e58692f0e92..51592143484f6 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -35,6 +35,14 @@ const ExegesisTarget *ExegesisTarget::lookup(Triple TT) { return nullptr; } +bool ExegesisTarget::isOpcodeSupported(const MCInstrDesc &Desc) const { + // By default, we ignore pseudo, branch, indirect branch, call, and return + // instructions, along with instructions that require custom inserter. 
+ return !(Desc.isPseudo() || Desc.usesCustomInsertionHook() || + Desc.isBranch() || Desc.isIndirectBranch() || Desc.isCall() || + Desc.isReturn()); +} + Expected> ExegesisTarget::createCounter(StringRef CounterName, const LLVMState &, ArrayRef ValidationCounters, diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h index 92cc1cb248a1c..db346c9dfdee6 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -153,6 +153,9 @@ class ExegesisTarget { return IsOpcodeAvailable(Opcode, Features); } + // Returns true if the opcode is subject to process. + virtual bool isOpcodeSupported(const MCInstrDesc &Desc) const; + // Sets the stack register to the auxiliary memory so that operations // requiring the stack can be formed (e.g., setting large registers). The code // generated by this function may clobber registers. @@ -238,6 +241,12 @@ class ExegesisTarget { "targets with target-specific operands should implement this"); } + virtual RegisterValue assignInitialRegisterValue(const Instruction &I, + const Operand &Op, + unsigned Reg) const { + return RegisterValue::zero(Reg); + } + // Returns true if this instruction is supported as a back-to-back // instructions. // FIXME: Eventually we should discover this dynamically. 
diff --git a/llvm/tools/llvm-exegesis/lib/Timer.cpp b/llvm/tools/llvm-exegesis/lib/Timer.cpp new file mode 100644 index 0000000000000..f12e5c933a3cd --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/Timer.cpp @@ -0,0 +1,16 @@ +#include "Timer.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { +namespace exegesis { + +bool TimerIsEnabled = false; + +const char TimerGroupName[] = "llvm-exegesis"; +const char TimerGroupDescription[] = "Time passes in each exegesis phase"; + +cl::opt EnableTimer("time-phases", cl::location(TimerIsEnabled), + cl::desc(TimerGroupDescription)); + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/Timer.h b/llvm/tools/llvm-exegesis/lib/Timer.h new file mode 100644 index 0000000000000..cea9be7f02fe2 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/Timer.h @@ -0,0 +1,21 @@ +//===---------- Timer.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_TIMER_H +#define LLVM_TOOLS_LLVM_EXEGESIS_TIMER_H + +namespace llvm { +namespace exegesis { +extern bool TimerIsEnabled; + +extern const char TimerGroupName[]; +extern const char TimerGroupDescription[]; + +} // namespace exegesis +} // namespace llvm +#endif diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 546ec770a8d22..ab583c2e14909 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -25,6 +25,7 @@ #include "lib/SnippetRepetitor.h" #include "lib/Target.h" #include "lib/TargetSelect.h" +#include "lib/Timer.h" #include "lib/ValidationEvent.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -43,6 +44,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Timer.h" #include "llvm/TargetParser/Host.h" #include #include @@ -50,10 +52,62 @@ namespace llvm { namespace exegesis { -static cl::opt OpcodeIndex( - "opcode-index", - cl::desc("opcode to measure, by index, or -1 to measure all opcodes"), - cl::cat(BenchmarkOptions), cl::init(0)); +struct IndexRangeParser : public cl::parser> { + IndexRangeParser(cl::Option &O) + : cl::parser>(O) {} + + // 'A..B' -> [A,B) + // 'A...B' -> [A,B] + bool parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, + std::pair &Val) { + StringRef ArgStr = ArgValue; + + int FirstIdx; + if (ArgStr.consumeInteger(10, FirstIdx)) + return O.error("Expecting an integer"); + + if (FirstIdx < 0 && FirstIdx != -1) + return O.error("-1 is the only allowed negative value, got '" + + std::to_string(FirstIdx) + "'"); + + if (ArgStr.consume_front("...")) { + if (FirstIdx >= 0) { + if (ArgStr.getAsInteger(10, Val.second)) + return O.error("Cannot parse '" + ArgStr 
+ "' as unsigned integer"); + Val.first = FirstIdx; + if (Val.second == 0 || Val.first > Val.second) + return O.error("Invalid range " + + formatv("[{0},{1}]", Val.first, Val.second)); + return false; + } + } else if (ArgStr.consume_front("..")) { + if (FirstIdx >= 0) { + if (ArgStr.getAsInteger(10, Val.second)) + return O.error("Cannot parse '" + ArgStr + "' as unsigned integer"); + Val.first = FirstIdx; + if (Val.second == 0 || Val.first > Val.second - 1) + return O.error("Invalid range " + + formatv("[{0},{1})", Val.first, Val.second)); + Val.second -= 1; + return false; + } + } else if (ArgStr.empty()) { + if (FirstIdx < 0) + Val = std::make_pair(0, UINT_MAX); + else + Val = std::make_pair(FirstIdx, FirstIdx); + return false; + } + + return O.error("Unrecognized format: '" + ArgValue + "'"); + } +}; + +static cl::opt, false, IndexRangeParser> + OpcodeIndices( + "opcode-index", + cl::desc("opcode to measure, by index, or -1 to measure all opcodes"), + cl::cat(BenchmarkOptions), cl::init(std::pair(0, 0))); static cl::opt OpcodeNames("opcode-name", @@ -72,6 +126,11 @@ static cl::opt "results. 
“-” uses stdin/stdout."), cl::cat(Options), cl::init("")); +static cl::opt + InputFile(cl::Positional, + cl::desc("Input benchmarks file to resume or snippet file"), + cl::init("-"), cl::cat(Options)); + static cl::opt BenchmarkMode( "mode", cl::desc("the mode to run"), cl::cat(Options), cl::values(clEnumValN(Benchmark::Latency, "latency", "Instruction Latency"), @@ -112,28 +171,37 @@ static cl::opt BenchmarkMeasurementsPrintProgress( cl::desc("Produce progress indicator when performing measurements"), cl::cat(BenchmarkOptions), cl::init(false)); -static cl::opt BenchmarkPhaseSelector( - "benchmark-phase", - cl::desc( - "it is possible to stop the benchmarking process after some phase"), - cl::cat(BenchmarkOptions), - cl::values( - clEnumValN(BenchmarkPhaseSelectorE::PrepareSnippet, "prepare-snippet", - "Only generate the minimal instruction sequence"), - clEnumValN(BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet, - "prepare-and-assemble-snippet", - "Same as prepare-snippet, but also dumps an excerpt of the " - "sequence (hex encoded)"), - clEnumValN(BenchmarkPhaseSelectorE::AssembleMeasuredCode, - "assemble-measured-code", - "Same as prepare-and-assemble-snippet, but also creates the " - "full sequence " - "that can be dumped to a file using --dump-object-to-disk"), - clEnumValN( - BenchmarkPhaseSelectorE::Measure, "measure", - "Same as prepare-measured-code, but also runs the measurement " - "(default)")), - cl::init(BenchmarkPhaseSelectorE::Measure)); +static const auto BenchmarkPhasesOptValues = cl::values( + clEnumValN(BenchmarkPhaseSelectorE::PrepareSnippet, "prepare-snippet", + "Only generate the minimal instruction sequence"), + clEnumValN(BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet, + "prepare-and-assemble-snippet", + "Same as prepare-snippet, but also dumps an excerpt of the " + "sequence (hex encoded)"), + clEnumValN(BenchmarkPhaseSelectorE::AssembleMeasuredCode, + "assemble-measured-code", + "Same as prepare-and-assemble-snippet, but also 
creates the " + "full sequence " + "that can be dumped to a file using --dump-object-to-disk"), + clEnumValN(BenchmarkPhaseSelectorE::Measure, "measure", + "Same as prepare-measured-code, but also runs the measurement " + "(default)")); + +static cl::opt + StopAfter("stop-after-phase", + cl::desc("Stop the benchmarking process after some phase"), + cl::cat(BenchmarkOptions), BenchmarkPhasesOptValues, + cl::init(BenchmarkPhaseSelectorE::Measure)); + +static cl::alias BenchmarkPhaseSelector("benchmark-phase", + cl::desc("Alias of -stop-after-phase"), + cl::aliasopt(StopAfter)); + +static cl::opt StartBefore( + "start-before-phase", + cl::desc("Resume the benchmarking process before a certain phase"), + cl::cat(BenchmarkOptions), BenchmarkPhasesOptValues, + cl::init(BenchmarkPhaseSelectorE::PrepareSnippet)); static cl::opt UseDummyPerfCounters("use-dummy-perf-counters", @@ -203,12 +271,13 @@ static cl::opt AnalysisInconsistencyEpsilon( cl::cat(AnalysisOptions), cl::init(0.1)); static cl::opt - AnalysisClustersOutputFile("analysis-clusters-output-file", cl::desc(""), - cl::cat(AnalysisOptions), cl::init("")); + AnalysisClustersOutputFile("analysis-clusters-output-", cl::desc(""), + cl::cat(AnalysisOptions), cl::init(""), + cl::Prefix); static cl::opt - AnalysisInconsistenciesOutputFile("analysis-inconsistencies-output-file", + AnalysisInconsistenciesOutputFile("analysis-inconsistencies-output-", cl::desc(""), cl::cat(AnalysisOptions), - cl::init("")); + cl::init(""), cl::Prefix); static cl::opt AnalysisDisplayUnstableOpcodes( "analysis-display-unstable-clusters", @@ -237,6 +306,11 @@ static cl::opt cl::desc("Target a specific cpu type (-mcpu=help for details)"), cl::value_desc("cpu-name"), cl::cat(Options), cl::init("native")); +static cl::list + MAttrs("mattr", cl::CommaSeparated, + cl::desc("Target specific attributes (-mattr=help for details)"), + cl::value_desc("a1,+a2,-a3,..."), cl::cat(Options)); + static cl::opt DumpObjectToDisk("dump-object-to-disk", 
cl::desc("dumps the generated benchmark object to disk " @@ -300,8 +374,9 @@ T ExitOnFileError(const Twine &FileName, Expected &&E) { // and returns the opcode indices or {} if snippets should be read from // `SnippetsFile`. static std::vector getOpcodesOrDie(const LLVMState &State) { + bool NoOpcodeIndices = !OpcodeIndices.first && !OpcodeIndices.second; const size_t NumSetFlags = (OpcodeNames.empty() ? 0 : 1) + - (OpcodeIndex == 0 ? 0 : 1) + + (NoOpcodeIndices ? 0 : 1) + (SnippetsFile.empty() ? 0 : 1); const auto &ET = State.getExegesisTarget(); const auto AvailableFeatures = State.getSubtargetInfo().getFeatureBits(); @@ -313,13 +388,13 @@ static std::vector getOpcodesOrDie(const LLVMState &State) { } if (!SnippetsFile.empty()) return {}; - if (OpcodeIndex > 0) - return {static_cast(OpcodeIndex)}; - if (OpcodeIndex < 0) { + if (!NoOpcodeIndices) { std::vector Result; unsigned NumOpcodes = State.getInstrInfo().getNumOpcodes(); Result.reserve(NumOpcodes); - for (unsigned I = 0, E = NumOpcodes; I < E; ++I) { + for (unsigned I = OpcodeIndices.first, + E = std::min(NumOpcodes - 1, OpcodeIndices.second); + I <= E; ++I) { if (!ET.isOpcodeAvailable(I, AvailableFeatures)) continue; Result.push_back(I); @@ -355,13 +430,8 @@ generateSnippets(const LLVMState &State, unsigned Opcode, const Instruction &Instr = State.getIC().getInstr(Opcode); const MCInstrDesc &InstrDesc = Instr.Description; // Ignore instructions that we cannot run. 
- if (InstrDesc.isPseudo() || InstrDesc.usesCustomInsertionHook()) - return make_error( - "Unsupported opcode: isPseudo/usesCustomInserter"); - if (InstrDesc.isBranch() || InstrDesc.isIndirectBranch()) - return make_error("Unsupported opcode: isBranch/isIndirectBranch"); - if (InstrDesc.isCall() || InstrDesc.isReturn()) - return make_error("Unsupported opcode: isCall/isReturn"); + if (!State.getExegesisTarget().isOpcodeSupported(InstrDesc)) + return make_error("Opcode is not supported"); const std::vector InstructionVariants = State.getExegesisTarget().generateInstructionVariants( @@ -386,11 +456,54 @@ generateSnippets(const LLVMState &State, unsigned Opcode, return Benchmarks; } -static void runBenchmarkConfigurations( - const LLVMState &State, ArrayRef Configurations, +static void deserializeRunnableConfigurations( + std::vector &Benchmarks, const BenchmarkRunner &Runner, + std::vector &RunnableConfigs, + SmallVectorImpl &Repetitions) { + for (unsigned I = 0U, E = Benchmarks.size(); I < E; ++I) { + // Reset any previous error. + Benchmarks[I].Error.clear(); + + RunnableConfigs.emplace_back( + ExitOnErr(Runner.getRunnableConfiguration(std::move(Benchmarks[I])))); + if (I > 0 && RunnableConfigs[I].BenchmarkResult.Key == + RunnableConfigs[I - 1].BenchmarkResult.Key) { + // Extend the current end index in Repetitions. + Repetitions.back() = RunnableConfigs.size(); + } else { + // Append a new entry into Repetitions. 
+ Repetitions.push_back(RunnableConfigs.size()); + } + } +} + +static void collectRunnableConfigurations( + ArrayRef Configurations, ArrayRef> Repetitors, - const BenchmarkRunner &Runner) { - assert(!Configurations.empty() && "Don't have any configurations to run."); + const BenchmarkRunner &Runner, + std::vector &RunnableConfigs, + SmallVectorImpl &Repetitions) { + + SmallVector MinInstructionCounts = {MinInstructions}; + if (RepetitionMode == Benchmark::MiddleHalfDuplicate || + RepetitionMode == Benchmark::MiddleHalfLoop) + MinInstructionCounts.push_back(MinInstructions * 2); + + for (const BenchmarkCode &Conf : Configurations) { + for (const auto &Repetitor : Repetitors) { + for (unsigned IterationRepetitions : MinInstructionCounts) + RunnableConfigs.emplace_back(ExitOnErr(Runner.getRunnableConfiguration( + Conf, IterationRepetitions, LoopBodySize, *Repetitor))); + } + Repetitions.emplace_back(RunnableConfigs.size()); + } +} + +static void runBenchmarkConfigurations( + const LLVMState &State, + std::vector &RunnableConfigs, + ArrayRef Repetitions, const BenchmarkRunner &Runner) { + assert(!RunnableConfigs.empty() && "Don't have any configurations to run."); std::optional FileOstr; if (BenchmarkFile != "-") { int ResultFD = 0; @@ -404,43 +517,38 @@ static void runBenchmarkConfigurations( std::optional> Meter; if (BenchmarkMeasurementsPrintProgress) - Meter.emplace(Configurations.size()); + Meter.emplace(RunnableConfigs.size()); - SmallVector MinInstructionCounts = {MinInstructions}; - if (RepetitionMode == Benchmark::MiddleHalfDuplicate || - RepetitionMode == Benchmark::MiddleHalfLoop) - MinInstructionCounts.push_back(MinInstructions * 2); + std::optional DumpFile; + if (DumpObjectToDisk.getNumOccurrences()) + DumpFile = DumpObjectToDisk; - for (const BenchmarkCode &Conf : Configurations) { + const std::optional BenchmarkCPU = + BenchmarkProcessCPU == -1 ? 
std::nullopt + : std::optional(BenchmarkProcessCPU.getValue()); + + unsigned StartIdx = 0; + for (unsigned EndIdx : Repetitions) { ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr); SmallVector AllResults; - for (const std::unique_ptr &Repetitor : - Repetitors) { - for (unsigned IterationRepetitions : MinInstructionCounts) { - auto RC = ExitOnErr(Runner.getRunnableConfiguration( - Conf, IterationRepetitions, LoopBodySize, *Repetitor)); - std::optional DumpFile; - if (DumpObjectToDisk.getNumOccurrences()) - DumpFile = DumpObjectToDisk; - const std::optional BenchmarkCPU = - BenchmarkProcessCPU == -1 - ? std::nullopt - : std::optional(BenchmarkProcessCPU.getValue()); - auto [Err, BenchmarkResult] = - Runner.runConfiguration(std::move(RC), DumpFile, BenchmarkCPU); - if (Err) { - // Errors from executing the snippets are fine. - // All other errors are a framework issue and should fail. - if (!Err.isA()) - ExitOnErr(std::move(Err)); - - BenchmarkResult.Error = toString(std::move(Err)); + for (unsigned Idx = StartIdx; Idx < EndIdx; ++Idx) { + auto RC = std::move(RunnableConfigs[Idx]); + auto [Err, BenchmarkResult] = + Runner.runConfiguration(std::move(RC), DumpFile, BenchmarkCPU); + if (Err) { + // Errors from executing the snippets are fine. + // All other errors are a framework issue and should fail. + if (!Err.isA()) { + llvm::errs() << "llvm-exegesis error: " << toString(std::move(Err)); + exit(1); } - AllResults.push_back(std::move(BenchmarkResult)); + BenchmarkResult.Error = toString(std::move(Err)); } - } + AllResults.push_back(std::move(BenchmarkResult)); + } + StartIdx = EndIdx; Benchmark &Result = AllResults.front(); // If any of our measurements failed, pretend they all have failed. 
@@ -465,15 +573,8 @@ static void runBenchmarkConfigurations( } void benchmarkMain() { - if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && - !UseDummyPerfCounters) { -#ifndef HAVE_LIBPFM - ExitWithError( - "benchmarking unavailable, LLVM was built without libpfm. You can " - "pass --benchmark-phase=... to skip the actual benchmarking or " - "--use-dummy-perf-counters to not query the kernel for real event " - "counts."); -#else + if (StopAfter == BenchmarkPhaseSelectorE::Measure && !UseDummyPerfCounters) { +#ifdef HAVE_LIBPFM if (pfm::pfmInitialize()) ExitWithError("cannot initialize libpfm"); #endif @@ -485,12 +586,20 @@ void benchmarkMain() { LLVMInitialize##TargetName##AsmParser(); #include "llvm/Config/TargetExegesis.def" - const LLVMState State = - ExitOnErr(LLVMState::Create(TripleName, MCPU, "", UseDummyPerfCounters)); + std::string FeaturesStr; + if (!MAttrs.empty()) { + SubtargetFeatures Features; + for (const auto &MAttr : MAttrs) + Features.AddFeature(MAttr); + FeaturesStr = Features.getString(); + } + + const LLVMState State = ExitOnErr( + LLVMState::Create(TripleName, MCPU, FeaturesStr, UseDummyPerfCounters)); // Preliminary check to ensure features needed for requested // benchmark mode are present on target CPU and/or OS. 
- if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure) + if (StopAfter == BenchmarkPhaseSelectorE::Measure) ExitOnErr(State.getExegesisTarget().checkFeatureSupport()); if (ExecutionMode == BenchmarkRunner::ExecutionModeE::SubProcess && @@ -500,83 +609,105 @@ void benchmarkMain() { const std::unique_ptr Runner = ExitOnErr(State.getExegesisTarget().createBenchmarkRunner( - BenchmarkMode, State, BenchmarkPhaseSelector, ExecutionMode, - BenchmarkRepeatCount, ValidationCounters, ResultAggMode)); + BenchmarkMode, State, StopAfter, ExecutionMode, BenchmarkRepeatCount, + ValidationCounters, ResultAggMode)); if (!Runner) { ExitWithError("cannot create benchmark runner"); } - const auto Opcodes = getOpcodesOrDie(State); - std::vector Configurations; - - unsigned LoopRegister = - State.getExegesisTarget().getDefaultLoopCounterRegister( - State.getTargetMachine().getTargetTriple()); - - if (Opcodes.empty()) { - Configurations = ExitOnErr(readSnippets(State, SnippetsFile)); - for (const auto &Configuration : Configurations) { - if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess && - (Configuration.Key.MemoryMappings.size() != 0 || - Configuration.Key.MemoryValues.size() != 0 || - Configuration.Key.SnippetAddress != 0)) - ExitWithError("Memory and snippet address annotations are only " - "supported in subprocess " - "execution mode"); + std::vector RunnableConfigs; + SmallVector Repetitions; + + // Write to standard output if file is not set. + if (BenchmarkFile.empty()) + BenchmarkFile = "-"; + + if (StartBefore == BenchmarkPhaseSelectorE::Measure) { + // Right now we only support resuming before the measurement phase. 
+ auto ErrOrBuffer = MemoryBuffer::getFileOrSTDIN(InputFile, /*IsText=*/true); + if (!ErrOrBuffer) + report_fatal_error(errorCodeToError(ErrOrBuffer.getError())); + + std::vector Benchmarks = + ExitOnErr(Benchmark::readYamls(State, **ErrOrBuffer)); + deserializeRunnableConfigurations(Benchmarks, *Runner, RunnableConfigs, + Repetitions); + } else { + const auto Opcodes = getOpcodesOrDie(State); + std::vector Configurations; + + unsigned LoopRegister = + State.getExegesisTarget().getDefaultLoopCounterRegister( + State.getTargetMachine().getTargetTriple()); + + if (Opcodes.empty()) { + NamedRegionTimer T("prepare-snippet", "Prepare Code Snippet", + TimerGroupName, TimerGroupDescription, TimerIsEnabled); + Configurations = ExitOnErr(readSnippets(State, SnippetsFile)); + for (const auto &Configuration : Configurations) { + if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess && + (Configuration.Key.MemoryMappings.size() != 0 || + Configuration.Key.MemoryValues.size() != 0 || + Configuration.Key.SnippetAddress != 0)) + ExitWithError("Memory and snippet address annotations are only " + "supported in subprocess " + "execution mode"); + } + LoopRegister = Configurations[0].Key.LoopRegister; } - LoopRegister = Configurations[0].Key.LoopRegister; - } - SmallVector, 2> Repetitors; - if (RepetitionMode != Benchmark::RepetitionModeE::AggregateMin) - Repetitors.emplace_back( - SnippetRepetitor::Create(RepetitionMode, State, LoopRegister)); - else { - for (Benchmark::RepetitionModeE RepMode : - {Benchmark::RepetitionModeE::Duplicate, - Benchmark::RepetitionModeE::Loop}) + SmallVector, 2> Repetitors; + if (RepetitionMode != Benchmark::RepetitionModeE::AggregateMin) Repetitors.emplace_back( - SnippetRepetitor::Create(RepMode, State, LoopRegister)); - } + SnippetRepetitor::Create(RepetitionMode, State, LoopRegister)); + else { + for (Benchmark::RepetitionModeE RepMode : + {Benchmark::RepetitionModeE::Duplicate, + Benchmark::RepetitionModeE::Loop}) + 
Repetitors.emplace_back( + SnippetRepetitor::Create(RepMode, State, LoopRegister)); + } - BitVector AllReservedRegs; - for (const std::unique_ptr &Repetitor : Repetitors) - AllReservedRegs |= Repetitor->getReservedRegs(); - - if (!Opcodes.empty()) { - for (const unsigned Opcode : Opcodes) { - // Ignore instructions without a sched class if - // -ignore-invalid-sched-class is passed. - if (IgnoreInvalidSchedClass && - State.getInstrInfo().get(Opcode).getSchedClass() == 0) { - errs() << State.getInstrInfo().getName(Opcode) - << ": ignoring instruction without sched class\n"; - continue; - } + BitVector AllReservedRegs; + for (const std::unique_ptr &Repetitor : Repetitors) + AllReservedRegs |= Repetitor->getReservedRegs(); + + if (!Opcodes.empty()) { + NamedRegionTimer T("prepare-snippet", "Prepare Code Snippet", + TimerGroupName, TimerGroupDescription, TimerIsEnabled); + for (const unsigned Opcode : Opcodes) { + // Ignore instructions without a sched class if + // -ignore-invalid-sched-class is passed. 
+ if (IgnoreInvalidSchedClass && + State.getInstrInfo().get(Opcode).getSchedClass() == 0) { + errs() << State.getInstrInfo().getName(Opcode) + << ": ignoring instruction without sched class\n"; + continue; + } - auto ConfigsForInstr = generateSnippets(State, Opcode, AllReservedRegs); - if (!ConfigsForInstr) { - logAllUnhandledErrors( - ConfigsForInstr.takeError(), errs(), - Twine(State.getInstrInfo().getName(Opcode)).concat(": ")); - continue; + auto ConfigsForInstr = generateSnippets(State, Opcode, AllReservedRegs); + if (!ConfigsForInstr) { + logAllUnhandledErrors( + ConfigsForInstr.takeError(), errs(), + Twine(State.getInstrInfo().getName(Opcode)).concat(": ")); + continue; + } + std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(), + std::back_inserter(Configurations)); } - std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(), - std::back_inserter(Configurations)); } - } - if (MinInstructions == 0) { - ExitOnErr.setBanner("llvm-exegesis: "); - ExitWithError("--min-instructions must be greater than zero"); - } + if (MinInstructions == 0) { + ExitOnErr.setBanner("llvm-exegesis: "); + ExitWithError("--min-instructions must be greater than zero"); + } - // Write to standard output if file is not set. - if (BenchmarkFile.empty()) - BenchmarkFile = "-"; + collectRunnableConfigurations(Configurations, Repetitors, *Runner, + RunnableConfigs, Repetitions); + } - if (!Configurations.empty()) - runBenchmarkConfigurations(State, Configurations, Repetitors, *Runner); + if (!RunnableConfigs.empty()) + runBenchmarkConfigurations(State, RunnableConfigs, Repetitions, *Runner); pfm::pfmTerminate(); } @@ -585,7 +716,20 @@ void benchmarkMain() { // if OutputFilename is non-empty. 
template static void maybeRunAnalysis(const Analysis &Analyzer, const std::string &Name, - const std::string &OutputFilename) { + StringRef OutputFilename) { + Analysis::OutputFormat Format; + if (OutputFilename.consume_front("file=")) { + Format = Analysis::OF_Default; + } else if (OutputFilename.consume_front("yaml=")) { + Format = Analysis::OF_YAML; + } else if (OutputFilename.consume_front("json=")) { + Format = Analysis::OF_JSON; + } else if (!OutputFilename.empty()) { + errs() << "Unrecognized output file format and path '" + OutputFilename + << "'\n"; + return; + } + if (OutputFilename.empty()) return; if (OutputFilename != "-") { @@ -597,7 +741,7 @@ static void maybeRunAnalysis(const Analysis &Analyzer, const std::string &Name, sys::fs::FA_Read | sys::fs::FA_Write); if (ErrorCode) ExitOnFileError(OutputFilename, errorCodeToError(ErrorCode)); - if (auto Err = Analyzer.run(ClustersOS)) + if (auto Err = Analyzer.run(ClustersOS, Format)) ExitOnFileError(OutputFilename, std::move(Err)); }