From 3137a825e3b2c66650828d6132309a5b94d66f44 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Fri, 10 Mar 2023 13:08:18 +0200 Subject: [PATCH 01/17] arcv: Add initial scheduling scheme. Signed-off-by: Claudiu Zissulescu --- gcc/config/riscv/arcv-rhx100.md | 103 ++++++++++++++++++++++++++++ gcc/config/riscv/riscv-cores.def | 1 + gcc/config/riscv/riscv-opts.h | 1 + gcc/config/riscv/riscv.cc | 113 +++++++++++++++++++++++++++++++ gcc/config/riscv/riscv.md | 4 +- 5 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 gcc/config/riscv/arcv-rhx100.md diff --git a/gcc/config/riscv/arcv-rhx100.md b/gcc/config/riscv/arcv-rhx100.md new file mode 100644 index 000000000000..256871fc1656 --- /dev/null +++ b/gcc/config/riscv/arcv-rhx100.md @@ -0,0 +1,103 @@ +;; DFA scheduling description of the Synopsys RHX-100 cpu +;; for GNU C compiler +;; Copyright (C) 2023 Free Software Foundation, Inc. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . 
+ +(define_automaton "arcv_rhx100") + +(define_cpu_unit "arcv_rhx100_ALU_A_fuse0_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_ALU_A_fuse1_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_ALU_B_fuse0_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_ALU_B_fuse1_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_MPY32" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_DIV" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_DMP_fuse0" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_DMP_fuse1" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_fdivsqrt" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueA_fuse0" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueA_fuse1" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueB_fuse0" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueB_fuse1" "arcv_rhx100") + +;; Instruction reservation for arithmetic instructions (pipe A, pipe B). +(define_insn_reservation "arcv_rhx100_alu_early_arith" 1 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "unknown,move,const,arith,shift,slt,multi,auipc,nop,logical,\ + bitmanip,min,max,minu,maxu,clz,ctz,atomic,\ + condmove,mvpair,zicond,cpop,clmul")) + "((arcv_rhx100_issueA_fuse0 + arcv_rhx100_ALU_A_fuse0_early) | (arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse1_early)) | ((arcv_rhx100_issueB_fuse0 + arcv_rhx100_ALU_B_fuse0_early) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse1_early))") + +(define_insn_reservation "arcv_rhx100_jmp_insn" 1 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "branch,jump,call,jalr,ret,trap")) + "arcv_rhx100_issueA_fuse0 | arcv_rhx100_issueA_fuse1") + +(define_insn_reservation "arcv_rhx100_div_insn" 12 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "idiv")) + "arcv_rhx100_issueA_fuse0 + arcv_rhx100_DIV, nothing*11") + +(define_insn_reservation "arcv_rhx100_mpy32_insn" 4 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "imul")) + "arcv_rhx100_issueA_fuse0 + arcv_rhx100_MPY32, nothing*3") + +(define_insn_reservation 
"arcv_rhx100_load_insn" 3 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "load,fpload")) + "(arcv_rhx100_issueB_fuse0 + arcv_rhx100_DMP_fuse0) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_DMP_fuse1)") + +(define_insn_reservation "arcv_rhx100_store_insn" 1 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "store,fpstore")) + "(arcv_rhx100_issueB_fuse0 + arcv_rhx100_DMP_fuse0) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_DMP_fuse1)") + +;; (soft) floating points +(define_insn_reservation "arcv_rhx100_xfer" 3 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "mfc,mtc,fcvt,fcvt_i2f,fcvt_f2i,fmove,fcmp")) + "(arcv_rhx100_ALU_A_fuse0_early | arcv_rhx100_ALU_B_fuse0_early), nothing*2") + +(define_insn_reservation "arcv_rhx100_fmul" 5 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "fadd,fmul,fmadd")) + "(arcv_rhx100_ALU_A_fuse0_early | arcv_rhx100_ALU_B_fuse0_early)") + +(define_insn_reservation "arcv_rhx100_fdiv" 20 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "fdiv,fsqrt")) + "arcv_rhx100_fdivsqrt*20") + +;(final_presence_set "arcv_rhx100_issueA_fuse1" "arcv_rhx100_issueA_fuse0") +;(final_presence_set "arcv_rhx100_issueB_fuse1" "arcv_rhx100_issueB_fuse0") +;(final_presence_set "arcv_rhx100_ALU_A_fuse1_early" "arcv_rhx100_ALU_A_fuse0_early") +;(final_presence_set "arcv_rhx100_ALU_B_fuse1_early" "arcv_rhx100_ALU_B_fuse0_early") + +;; Bypasses +;(define_bypass 0 "arcv_rhx100_alu_early_arith" "arcv_rhx100_store_insn" "riscv_store_data_bypass_p") +(define_bypass 1 "arcv_rhx100_alu_early_arith" "arcv_rhx100_store_insn" "riscv_store_data_bypass_p") + +;(define_bypass 0 "arcv_rhx100_load_insn" "arcv_rhx100_store_insn" "riscv_store_data_bypass_p") +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_store_insn" "riscv_store_data_bypass_p") +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_alu_early_arith") +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_mpy*_insn") +(define_bypass 2 "arcv_rhx100_load_insn" "arcv_rhx100_load_insn") 
+(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_div_insn") + +(define_bypass 3 "arcv_rhx100_mpy32_insn" "arcv_rhx100_mpy*_insn") +(define_bypass 3 "arcv_rhx100_mpy32_insn" "arcv_rhx100_div_insn") diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def index d1708f3785b6..3b5da61d0bc9 100644 --- a/gcc/config/riscv/riscv-cores.def +++ b/gcc/config/riscv/riscv-cores.def @@ -51,6 +51,7 @@ RISCV_TUNE("xt-c920v2", generic, generic_ooo_tune_info) RISCV_TUNE("xiangshan-nanhu", xiangshan, xiangshan_nanhu_tune_info) RISCV_TUNE("xiangshan-kunminghu", xiangshan, generic_ooo_tune_info) RISCV_TUNE("arc-v-rmx-100-series", arcv_rmx100, arcv_rmx100_tune_info) +RISCV_TUNE("arc-v-rhx-100-series", arcv_rhx100, arcv_rhx100_tune_info) RISCV_TUNE("generic-ooo", generic_ooo, generic_ooo_tune_info) RISCV_TUNE("size", generic, optimize_size_tune_info) RISCV_TUNE("mips-p8700", mips_p8700, mips_p8700_tune_info) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 7be10413b4d9..632d426503be 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -62,6 +62,7 @@ enum riscv_microarchitecture_type { mips_p8700, tt_ascalon_d8, arcv_rmx100, + arcv_rhx100, }; extern enum riscv_microarchitecture_type riscv_microarchitecture; diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 8719c2942b55..bb8a9282f15d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -291,6 +291,7 @@ enum riscv_fusion_pairs RISCV_FUSE_BFEXT = (1 << 11), RISCV_FUSE_EXPANDED_LD = (1 << 12), RISCV_FUSE_B_ALUI = (1 << 13), + RISCV_FUSE_ARCV = (1 << 14), }; /* Costs of various operations on the different architectures. */ @@ -709,6 +710,30 @@ static const struct riscv_tune_param arcv_rmx100_tune_info = { NULL, /* loop_align */ }; +/* Costs to use when optimizing for Synopsys RHX-100. 
*/ +static const struct riscv_tune_param arcv_rhx100_tune_info = { + {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_add */ + {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_mul */ + {COSTS_N_INSNS (20), COSTS_N_INSNS (20)}, /* fp_div */ + {COSTS_N_INSNS (4), COSTS_N_INSNS (4)}, /* int_mul */ + {COSTS_N_INSNS (27), COSTS_N_INSNS (43)}, /* int_div */ + 4, /* issue_rate */ + 9, /* branch_cost */ + 2, /* memory_cost */ + 8, /* fmv_cost */ + false, /* slow_unaligned_access */ + false, /* vector_unaligned_access */ + false, /* use_divmod_expansion */ + false, /* overlap_op_by_pieces */ + true, /* use_zero_stride_load */ + false, /* speculative_sched_vsetvl */ + RISCV_FUSE_ARCV, /* fusible_ops */ + NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ +}; + /* Costs to use when optimizing for size. */ static const struct riscv_tune_param optimize_size_tune_info = { {COSTS_N_INSNS (1), COSTS_N_INSNS (1)}, /* fp_add */ @@ -11009,6 +11034,91 @@ riscv_set_is_shNadduw (rtx set) && REG_P (SET_DEST (set))); } +/* Return TRUE if two addresses can be fused. */ + +static bool +arcv_fused_addr_p (rtx addr0, rtx addr1) +{ + rtx base0, base1, tmp; + HOST_WIDE_INT off0 = 0, off1 = 0; + + if (GET_CODE (addr0) == PLUS) + { + base0 = XEXP (addr0, 0); + tmp = XEXP (addr0, 1); + if (!CONST_INT_P (tmp)) + return false; + off0 = INTVAL (tmp); + } + else if (REG_P (addr0)) + base0 = addr0; + else + return false; + + if (GET_CODE (addr1) == PLUS) + { + base1 = XEXP (addr1, 0); + tmp = XEXP (addr1, 1); + if (!CONST_INT_P (tmp)) + return false; + off1 = INTVAL (tmp); + } + else if (REG_P (addr1)) + base1 = addr1; + else + return false; + + /* Check if we have the same base. */ + gcc_assert (REG_P (base0) && REG_P (base1)); + if (REGNO (base0) != REGNO (base1)) + return false; + + /* Offsets have to be aligned to word boundary and adjacent in memory, + but the memory operations can be narrower. 
*/
+  if ((off0 % UNITS_PER_WORD == 0) && (abs (off1 - off0) == UNITS_PER_WORD))
+    return true;
+
+  return false;
+}
+
+/* Return true if PREV and CURR should be kept together during scheduling.  */
+
+static bool
+arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+{
+  rtx prev_set = single_set (prev);
+  rtx curr_set = single_set (curr);
+  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
+  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
+
+  /* Don't handle anything with a jump.  */
+  if (!simple_sets_p)
+    return false;
+
+  /* Fuse adjacent loads and stores.  */
+  if (get_attr_type (prev) == TYPE_LOAD
+      && get_attr_type (curr) == TYPE_LOAD)
+    {
+      rtx addr0 = XEXP (SET_SRC (prev_set), 0);
+      rtx addr1 = XEXP (SET_SRC (curr_set), 0);
+
+      if (arcv_fused_addr_p (addr0, addr1))
+	return true;
+    }
+
+  if (get_attr_type (prev) == TYPE_STORE
+      && get_attr_type (curr) == TYPE_STORE)
+    {
+      rtx addr0 = XEXP (SET_DEST (prev_set), 0);
+      rtx addr1 = XEXP (SET_DEST (curr_set), 0);
+
+      if (arcv_fused_addr_p (addr0, addr1))
+	return true;
+    }
+
+  return false;
+}
+
 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and
    CURR should be kept together during scheduling.  */
 
@@ -11641,6 +11751,9 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 	}
     }
 
+  if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV))
+    return arcv_macro_fusion_pair_p (prev, curr);
+
   return false;
 }
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 823f8dda8a30..5779a862743d 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -672,7 +672,7 @@
 ;; Microarchitectures we know how to tune for.
 ;; Keep this in sync with enum riscv_microarchitecture.
 (define_attr "tune"
-  "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,generic_ooo,mips_p8700,tt_ascalon_d8,arcv_rmx100"
+  "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,generic_ooo,mips_p8700,tt_ascalon_d8,arcv_rmx100,arcv_rhx100"
   (const (symbol_ref "((enum attr_tune) riscv_microarchitecture)")))
 
 ;; Describe a user's asm statement.
@@ -4967,3 +4967,4 @@
 (include "generic-ooo.md")
 (include "tt-ascalon-d8.md")
 (include "arcv-rmx100.md")
+(include "arcv-rhx100.md")

From 8984a0562829f8f835617f9911d542f63ce9e1d2 Mon Sep 17 00:00:00 2001
From: Artemiy Volkov
Date: Wed, 11 Oct 2023 10:26:20 +0200
Subject: [PATCH 02/17] arcv: fuse load-immediate with store

For the RMX-500 and RHX cores, the sequence "load-immediate + store"
(that is used to store a constant value) can be executed in 1 cycle,
provided the two instructions are kept next to one another.

This patch handles this case in riscv_macro_fusion_pair_p().

Signed-off-by: Artemiy Volkov
---
 gcc/config/riscv/riscv.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index bb8a9282f15d..5a7d7e508199 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -11116,6 +11116,16 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
       return true;
     }
 
+  /* Fuse load-immediate with a store of the destination register. 
*/ + if (get_attr_type (prev) == TYPE_MOVE + && get_attr_move_type (prev) == MOVE_TYPE_CONST + && get_attr_type (curr) == TYPE_STORE + && ((REG_P (SET_SRC (curr_set)) + && SET_DEST (prev_set) == SET_SRC (curr_set)) + || (SUBREG_P (SET_SRC (curr_set)) + && SET_DEST (prev_set) == SUBREG_REG (SET_SRC (curr_set))))) + return true; + return false; } From 6e9d52cc50aa96c03d33c9f04cd086069471afcb Mon Sep 17 00:00:00 2001 From: Shahab Vahedi Date: Thu, 19 Oct 2023 13:28:17 +0200 Subject: [PATCH 03/17] arcv: Introduce riscv_is_micro_arch () ARC-V related optimisations must be guarded like: if (riscv_microarchitecture == ) { ... } Introduce an inline function that encapsulates this: static inline bool riscv_is_micro_arch () Use it to define __riscv_rhx whenever compiling for the RHX microarchitecture. Signed-off-by: Shahab Vahedi --- gcc/config/riscv/riscv-c.cc | 3 +++ gcc/config/riscv/riscv-protos.h | 3 +++ gcc/config/riscv/riscv.cc | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/gcc/config/riscv/riscv-c.cc b/gcc/config/riscv/riscv-c.cc index d497326e0611..52d240ceb89f 100644 --- a/gcc/config/riscv/riscv-c.cc +++ b/gcc/config/riscv/riscv-c.cc @@ -149,6 +149,9 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile) builtin_define_with_int_value ("__riscv_th_v_intrinsic", riscv_ext_version_value (0, 11)); + if (riscv_is_micro_arch (arcv_rhx100)) + builtin_define ("__riscv_rhx"); + /* Define architecture extension test macros. 
*/ builtin_define_with_int_value ("__riscv_arch_test", 1); diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 5881cb9529ce..77f447577cad 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -826,6 +826,9 @@ extern unsigned int th_int_get_mask (unsigned int); extern unsigned int th_int_get_save_adjustment (void); extern rtx th_int_adjust_cfi_prologue (unsigned int); extern const char *th_asm_output_opcode (FILE *asm_out_file, const char *p); + +extern bool riscv_is_micro_arch (enum riscv_microarchitecture_type); + #ifdef RTX_CODE extern const char* th_mempair_output_move (rtx[4], bool, machine_mode, RTX_CODE); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 5a7d7e508199..49fe216c2e82 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -903,6 +903,12 @@ typedef enum typedef insn_code (*code_for_push_pop_t) (machine_mode); +bool +riscv_is_micro_arch (enum riscv_microarchitecture_type arch) +{ + return (riscv_microarchitecture == arch); +} + void riscv_frame_info::reset(void) { total_size = 0; From 1aa42c1f13ebb7dd8dac8fa61615dff5dc2af9fe Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 19 Mar 2024 19:27:03 -0700 Subject: [PATCH 04/17] arcv: fuse load-immediate with dependent branch With this commit, we allow a load-immediate to be macro-op fused with a successive conditional branch that is dependent on it, e.g.: li t0, #imm bge t1, t0, .label Additionally, add a new testcase to check that this fusion type is handled correctly. 
Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv.cc | 15 +++++++++++++-- .../gcc.target/riscv/arcv-fusion-limm-condbr.c | 12 ++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 77f447577cad..be047d7b7692 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -827,6 +827,7 @@ extern unsigned int th_int_get_save_adjustment (void); extern rtx th_int_adjust_cfi_prologue (unsigned int); extern const char *th_asm_output_opcode (FILE *asm_out_file, const char *p); +extern bool riscv_macro_fusion_p (); extern bool riscv_is_micro_arch (enum riscv_microarchitecture_type); #ifdef RTX_CODE diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 49fe216c2e82..f74668599a43 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -10951,7 +10951,7 @@ riscv_sched_reorder (FILE *, int, rtx_insn **ready, int *nreadyp, int) /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports instruction fusion of some sort. */ -static bool +bool riscv_macro_fusion_p (void) { return tune_param->fusible_ops != RISCV_FUSE_NOTHING; @@ -11097,7 +11097,18 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) /* prev and curr are simple SET insns i.e. no flag setting or branching. */ bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); - /* Don't handle anything with a jump. */ + /* Fuse load-immediate with a dependent conditional branch. 
*/ + if (get_attr_type (prev) == TYPE_MOVE + && get_attr_move_type (prev) == MOVE_TYPE_CONST + && any_condjump_p (curr)) + { + rtx comp = XEXP (SET_SRC (curr_set), 0); + + return (REG_P (XEXP (comp, 0)) && XEXP (comp, 0) == SET_DEST (prev_set)) + || (REG_P (XEXP (comp, 1)) && XEXP (comp, 1) == SET_DEST (prev_set)); + } + + /* Don't handle anything with a jump past this point. */ if (!simple_sets_p) return false; diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c new file mode 100644 index 000000000000..cc2a56a2e086 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=arc-v-rhx-100-series" } */ + +int +f (int x) +{ + begin: + if (x <= 3) + goto begin; +} + +/* { dg-final { scan-assembler "\\sli\\sa5,3\n\\sble\\sa0,a5,.L\[0-9\]+\n" } } */ From 0e5512b9504d5562d035509faa26e51f81e58f61 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 4 Jun 2024 04:58:46 -0700 Subject: [PATCH 05/17] arcv: implement TARGET_SCHED_FUSION_PRIORITY To take better advantage of double load/store fusion, make use of the sched_fusion pass that assigns unique "fusion priorities" to load/store instructions and schedules operations on adjacent addresses together. This maximizes the probability that loads/stores are fused between each other instead of with other instructions. Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 75 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index f74668599a43..3169b8b9fd87 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11784,6 +11784,78 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return false; } +/* If INSN is a load or store of address in the form of [base+offset], + extract the two parts and set to BASE and OFFSET. 
IS_LOAD is set + to TRUE if it's a load. Return TRUE if INSN is such an instruction, + otherwise return FALSE. */ + +static bool +fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load) +{ + rtx x, dest, src; + + gcc_assert (INSN_P (insn)); + x = PATTERN (insn); + if (GET_CODE (x) != SET) + return false; + + src = SET_SRC (x); + dest = SET_DEST (x); + if (REG_P (src) && MEM_P (dest)) + { + *is_load = false; + extract_base_offset_in_addr (dest, base, offset); + } + else if (MEM_P (src) && REG_P (dest)) + { + *is_load = true; + extract_base_offset_in_addr (src, base, offset); + } + else + return false; + + return (*base != NULL_RTX && *offset != NULL_RTX); +} + +static void +riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, + int *pri) +{ + int tmp, off_val; + bool is_load; + rtx base, offset; + + gcc_assert (INSN_P (insn)); + + tmp = max_pri - 1; + if (!fusion_load_store (insn, &base, &offset, &is_load)) + { + *pri = tmp; + *fusion_pri = tmp; + return; + } + + tmp /= 2; + + /* INSN with smaller base register goes first. */ + tmp -= ((REGNO (base) & 0xff) << 20); + + /* INSN with smaller offset goes first. */ + off_val = (int)(INTVAL (offset)); + + /* Put loads/stores operating on adjacent words into the same + * scheduling group. */ + *fusion_pri = tmp - ((off_val / (UNITS_PER_WORD * 2)) << 1) + is_load; + + if (off_val >= 0) + tmp -= (off_val & 0xfffff); + else + tmp += ((- off_val) & 0xfffff); + + *pri = tmp; + return; +} + /* Adjust the cost/latency of instructions for scheduling. For now this is just used to change the latency of vector instructions according to their LMUL. 
We assume that an insn with LMUL == 8 requires @@ -16468,6 +16540,9 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode) #undef TARGET_SCHED_INIT #define TARGET_SCHED_INIT riscv_sched_init +#undef TARGET_SCHED_FUSION_PRIORITY +#define TARGET_SCHED_FUSION_PRIORITY riscv_sched_fusion_priority + #undef TARGET_SCHED_VARIABLE_ISSUE #define TARGET_SCHED_VARIABLE_ISSUE riscv_sched_variable_issue From 310e813933f1c0dd3184c25e66d2c08ddd6ed6f1 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Wed, 12 Jun 2024 02:06:43 -0700 Subject: [PATCH 06/17] arcv: fuse load/store + register post-{inc,dec}rement With this patch, arcv_macro_fusion_pair_p () recognizes instruction pairs like: LOAD rd1, [rs1,offset] add/sub rd2, rs1, rs2/imm (where all regs are distinct) and: STORE rs2, [rs1,offset] add/sub rd, rs1, rs2/imm as fused macro-op pairs. In the case of a load, rd1 being equal to rd2, rs1, or rs2 would lead to data hazards, hence this is disallowed; for stores, rs1 and rs2 of the two instructions must match. Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 3169b8b9fd87..ea1d1d3f1348 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11133,6 +11133,38 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return true; } + /* Fuse load/store + register post-{inc,dec}rement: + * prev (ld) == (set (reg:X rd1) (mem:X (plus:X (reg:X rs1) (const_int)))) + * or + * prev (st) == (set (mem:X (plus:X (reg:X rs1) (const_int))) (reg:X rs2)) + * ... 
+ */ + if ((GET_CODE (SET_SRC (curr_set)) == PLUS + || GET_CODE (SET_SRC (curr_set)) == MINUS) + && REG_P (XEXP (SET_SRC (curr_set), 0)) + && ((get_attr_type (prev) == TYPE_LOAD + && REG_P (XEXP (SET_SRC (prev_set), 0)) + && REGNO (XEXP (SET_SRC (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (prev_set), 0)) + != REGNO (SET_DEST (prev_set)) + && REGNO (SET_DEST (prev_set)) != REGNO (SET_DEST (curr_set)) + /* curr (op-imm) == (set (reg:X rd2) (plus/minus (reg:X rs1) (const_int))) */ + && (CONST_INT_P (XEXP (SET_SRC (curr_set), 1)) + /* or curr (op) == (set (reg:X rd2) (plus/minus (reg:X rs1) (reg:X rs2))) */ + || REGNO (SET_DEST (prev_set)) + != REGNO (XEXP (SET_SRC (curr_set), 1)))) + || (get_attr_type (prev) == TYPE_STORE + && REG_P (XEXP (SET_DEST (prev_set), 0)) + && REGNO (XEXP (SET_DEST (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 0)) + /* curr (op-imm) == (set (reg:X rd2) (plus/minus (reg:X rs1) (const_int))) */ + && (CONST_INT_P (XEXP (SET_SRC (curr_set), 1)) + /* or curr (op) == (set (reg:X rd2) (plus/minus (reg:X rs1) (reg:X rs2))) */ + || REGNO (XEXP (SET_DEST (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 1)))))) + return true; + /* Fuse load-immediate with a store of the destination register. 
*/ if (get_attr_type (prev) == TYPE_MOVE && get_attr_move_type (prev) == MOVE_TYPE_CONST From 38016d4603147d33beb97a59ff2ce764c37eb83f Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Mon, 17 Jun 2024 01:23:28 -0700 Subject: [PATCH 07/17] arcv: fuse load/store with lui Fuse together instruction pairs such as: LOAD rd1, [rs1,offset] lui rd2, imm (where rd1 and rd2 are distinct) and: STORE rs2, [rs1,offset] lui rd, imm Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index ea1d1d3f1348..37690afd1b24 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11165,6 +11165,22 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) == REGNO (XEXP (SET_SRC (curr_set), 1)))))) return true; + /* Fuse load/store with lui: + * prev (ld) == (set (reg:X rd1) (mem:X (plus:X (reg:X) (const_int)))) + * or + * prev (st) == (set (mem:X (plus:X (reg:X) (const_int))) (reg:X rD)) + * + * curr (lui) == (set (reg:X rd2) (const_int UPPER_IMM_20)) + */ + if (((get_attr_type (curr) == TYPE_MOVE + && GET_CODE (SET_SRC (curr_set)) == HIGH) + || (CONST_INT_P (SET_SRC (curr_set)) + && LUI_OPERAND (INTVAL (SET_SRC (curr_set))))) + && ((get_attr_type (prev) == TYPE_LOAD + && REGNO (SET_DEST (prev_set)) != REGNO (SET_DEST (curr_set))) + || get_attr_type (prev) == TYPE_STORE)) + return true; + /* Fuse load-immediate with a store of the destination register. */ if (get_attr_type (prev) == TYPE_MOVE && get_attr_move_type (prev) == MOVE_TYPE_CONST From 32300a0753c170f8ee36cd052fe4970eb3431599 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Fri, 21 Jun 2024 08:19:46 -0700 Subject: [PATCH 08/17] arcv: create a 32-bit integer multiply-add instruction pattern The RHX core executes integer multiply-add sequences of the form: mul r1,r2,r3 add r1,r1,r4 in 1 cycle due to macro-op fusion. 
This patch adds a define_insn_and_split to recognize the above sequence and preserve it as a single insn up until the post-reload split pass. Since, due to a microarchitectural restriction, the output operand of both instructions must be the same register, the insn_and_split pattern has two alternatives corresponding to the following cases: (0) r1 is different from r4, in which case the insn can be split to the sequence above; (1) r1 and r4 are the same, in which case a temporary register has to be used and there is no fusion. Alternative (1) is discouraged so that reload maximizes the number of instances where MAC fusion can be applied. Since RHX is a rv32im core, the pattern requires that the target is 32-bit and supports multiplication. In addition, the {u,}maddhisi3 expand is implemented for RHX to convert the ( 16-bit x 16-bit + 32_bit ) WIDEN_MULT_PLUS_EXPR GIMPLE operator to the aforementioned madd_split instruction directly. Lastly, a very basic testcase is introduced to make sure that the new patterns are sufficient to produce MAC-fusion-aware code. No new dejagnu failures with RUNTESTFLAGS="CFLAGS_FOR_TARGET=-mtune=rhx dg.exp". 
Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.md | 57 ++++++++++++++++++- .../gcc.target/riscv/arcv-fusion-madd.c | 12 ++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 5779a862743d..02a8ff39a59f 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -4502,7 +4502,35 @@ (mult:SI (sign_extend:SI (match_operand:HI 1 "register_operand")) (sign_extend:SI (match_operand:HI 2 "register_operand"))) (match_operand:SI 3 "register_operand")))] - "TARGET_XTHEADMAC" + "TARGET_XTHEADMAC || (riscv_is_micro_arch (arcv_rhx100) + && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL))" + { + if (riscv_is_micro_arch (arcv_rhx100)) + { + rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); + emit_insn (gen_extendhisi2 (tmp0, operands[1])); + emit_insn (gen_extendhisi2 (tmp1, operands[2])); + emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3])); + DONE; + } + } +) + +(define_expand "umaddhisi4" + [(set (match_operand:SI 0 "register_operand") + (plus:SI + (mult:SI (zero_extend:SI (match_operand:HI 1 "register_operand")) + (zero_extend:SI (match_operand:HI 2 "register_operand"))) + (match_operand:SI 3 "register_operand")))] + "riscv_is_micro_arch (arcv_rhx100) + && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)" + { + rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); + emit_insn (gen_zero_extendhisi2 (tmp0, operands[1])); + emit_insn (gen_zero_extendhisi2 (tmp1, operands[2])); + emit_insn (gen_madd_split (operands[0], tmp0, tmp1, operands[3])); + DONE; + } ) (define_expand "msubhisi4" @@ -4514,6 +4542,33 @@ "TARGET_XTHEADMAC" ) +(define_insn_and_split "madd_split" + [(set (match_operand:SI 0 "register_operand" "=&r,r") + (plus:SI + (mult:SI (match_operand:SI 1 "register_operand" "r,r") + (match_operand:SI 2 "register_operand" "r,r")) + (match_operand:SI 3 "register_operand" 
"r,?0"))) + (clobber (match_scratch:SI 4 "=&r,&r"))] + "riscv_is_micro_arch (rhx) && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)" + "#" + "&& reload_completed" + [(const_int 0)] + "{ + if (REGNO (operands[0]) == REGNO (operands[3])) + { + emit_insn (gen_mulsi3 (operands[4], operands[1], operands[2])); + emit_insn (gen_addsi3 (operands[0], operands[3], operands[4])); + } + else + { + emit_insn (gen_mulsi3 (operands[0], operands[1], operands[2])); + emit_insn (gen_addsi3 (operands[0], operands[0], operands[3])); + } + DONE; + }" + [(set_attr "type" "imul")] +) + ;; String compare with length insn. ;; Argument 0 is the target (result) ;; Argument 1 is the source1 diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c new file mode 100644 index 000000000000..eb8665f576c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target rv32 } */ +/* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" } } */ +/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im -mabi=ilp32" } */ + +int +f (int x, int y, int z, int v, int w) +{ + return x + y * z + v * w; +} + +/* { dg-final { scan-assembler {\smul\s([ast][0-9]+),a1,a2\n\sadd\s\1,\1,a0\n\smul\sa0,a3,a4\n\sadd\sa0,a0,\1\n} } } */ From 8bf79f3030ef8e6d22b3cb85eb3de913dd873383 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Fri, 21 Jun 2024 07:27:33 -0700 Subject: [PATCH 09/17] arcv: fuse integer multiply-add instruction pairs To make sure that the multiply-add pairs (split post-reload from the madd_split instruction) are not broken up by the sched2 pass, designate them as fusable in arcv_macro_fusion_pair_p (). 
Signed-off-by: Artemiy Volkov --- gcc/config/riscv/arcv-rhx100.md | 5 +++++ gcc/config/riscv/riscv.cc | 7 +++++++ gcc/config/riscv/riscv.md | 25 ++++++++++--------------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/gcc/config/riscv/arcv-rhx100.md b/gcc/config/riscv/arcv-rhx100.md index 256871fc1656..08d468b1288c 100644 --- a/gcc/config/riscv/arcv-rhx100.md +++ b/gcc/config/riscv/arcv-rhx100.md @@ -42,6 +42,11 @@ condmove,mvpair,zicond,cpop,clmul")) "((arcv_rhx100_issueA_fuse0 + arcv_rhx100_ALU_A_fuse0_early) | (arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse1_early)) | ((arcv_rhx100_issueB_fuse0 + arcv_rhx100_ALU_B_fuse0_early) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse1_early))") +(define_insn_reservation "arcv_rhx100_imul_fused" 4 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "imul_fused")) + "(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early + arcv_rhx100_MPY32), nothing*3") + (define_insn_reservation "arcv_rhx100_jmp_insn" 1 (and (eq_attr "tune" "arcv_rhx100") (eq_attr "type" "branch,jump,call,jalr,ret,trap")) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 37690afd1b24..bafa14cfd8ab 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11191,6 +11191,13 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) && SET_DEST (prev_set) == SUBREG_REG (SET_SRC (curr_set))))) return true; + if (GET_CODE (SET_SRC (prev_set)) == MULT + && GET_CODE (SET_SRC (curr_set)) == PLUS + && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set)) + && (REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 0)) + || REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 1)))) + return true; + return false; } diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 02a8ff39a59f..239e8d859811 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -518,7 +518,7 @@ 
vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down, vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll, vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz, - vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16, + vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,imul_fused, sf_vc,sf_vc_se" (cond [(eq_attr "got" "load") (const_string "load") @@ -4528,7 +4528,7 @@ rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); emit_insn (gen_zero_extendhisi2 (tmp0, operands[1])); emit_insn (gen_zero_extendhisi2 (tmp1, operands[2])); - emit_insn (gen_madd_split (operands[0], tmp0, tmp1, operands[3])); + emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3])); DONE; } ) @@ -4542,31 +4542,26 @@ "TARGET_XTHEADMAC" ) -(define_insn_and_split "madd_split" +(define_insn "madd_split_fused" [(set (match_operand:SI 0 "register_operand" "=&r,r") (plus:SI (mult:SI (match_operand:SI 1 "register_operand" "r,r") (match_operand:SI 2 "register_operand" "r,r")) (match_operand:SI 3 "register_operand" "r,?0"))) (clobber (match_scratch:SI 4 "=&r,&r"))] - "riscv_is_micro_arch (rhx) && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)" - "#" - "&& reload_completed" - [(const_int 0)] - "{ + "riscv_is_micro_arch (arcv_rhx100) + && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)" + { if (REGNO (operands[0]) == REGNO (operands[3])) { - emit_insn (gen_mulsi3 (operands[4], operands[1], operands[2])); - emit_insn (gen_addsi3 (operands[0], operands[3], operands[4])); + return "mul\t%4,%1,%2\n\tadd\t%4,%3,%4\n\tmv\t%0,%4"; } else { - emit_insn (gen_mulsi3 (operands[0], operands[1], operands[2])); - emit_insn (gen_addsi3 (operands[0], operands[0], operands[3])); + return "mul\t%0,%1,%2\n\tadd\t%0,%0,%3"; } - DONE; - }" - [(set_attr "type" "imul")] + } + [(set_attr "type" "imul_fused")] ) ;; String compare with length insn. 
From 5dc1c6eff4e233725743e6759530c8887d7d83c3 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Sun, 7 Jul 2024 21:31:13 +0200 Subject: [PATCH 10/17] arcv: implement bit-extract fusion The bitfield zero_extract operation is normally expanded into an srai followed by an andi. (With the ZBS extension enabled, the special case of 1-bit zero-extract is implemented with the bexti insn.) However, since the RHX core can execute a shift-left and a shift-right of the same register in 1 cycle, we would prefer to emit those two instructions instead, and schedule them together so that macro fusion can take place. The required steps to achieve this are: (1) Create an insn_and_split that handles the zero_extract RTX; (2) Tell the combiner to use that split by lowering the cost of the zero_extract RTX when the target is the RHX core; (3) Designate the resulting slli + srli pair as fusable by the scheduler. Attached is a small testcase demonstrating the split, and that the bexti insn still takes priority over the shift pair. 
Signed-off-by: Artemiy Volkov --- gcc/config/riscv/arcv-rhx100.md | 5 +++++ gcc/config/riscv/riscv.cc | 10 +++++++++- gcc/config/riscv/riscv.md | 19 ++++++++++++++++++- .../gcc.target/riscv/arcv-fusion-xbfu.c | 14 ++++++++++++++ 4 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c diff --git a/gcc/config/riscv/arcv-rhx100.md b/gcc/config/riscv/arcv-rhx100.md index 08d468b1288c..398f13131606 100644 --- a/gcc/config/riscv/arcv-rhx100.md +++ b/gcc/config/riscv/arcv-rhx100.md @@ -47,6 +47,11 @@ (eq_attr "type" "imul_fused")) "(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early + arcv_rhx100_MPY32), nothing*3") +(define_insn_reservation "arcv_rhx100_alu_fused" 1 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "alu_fused")) + "(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early) | (arcv_rhx100_issueB_fuse0 + arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse0_early + arcv_rhx100_ALU_B_fuse1_early)") + (define_insn_reservation "arcv_rhx100_jmp_insn" 1 (and (eq_attr "tune" "arcv_rhx100") (eq_attr "type" "branch,jump,call,jalr,ret,trap")) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index bafa14cfd8ab..c1fd5414657f 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -4337,7 +4337,8 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN } gcc_fallthrough (); case SIGN_EXTRACT: - if (TARGET_XTHEADBB && outer_code == SET + if ((riscv_is_micro_arch (arcv_rhx100) || TARGET_XTHEADBB) + && outer_code == SET && CONST_INT_P (XEXP (x, 1)) && CONST_INT_P (XEXP (x, 2))) { @@ -11198,6 +11199,13 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) || REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 1)))) return true; + /* Fuse logical shift left with logical shift right (bit-extract pattern). 
*/ + if (GET_CODE (SET_SRC (prev_set)) == ASHIFT + && GET_CODE (SET_SRC (curr_set)) == LSHIFTRT + && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set)) + && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 0))) + return true; + return false; } diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 239e8d859811..eece6f1c17e7 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -518,7 +518,7 @@ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down, vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll, vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz, - vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,imul_fused, + vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,imul_fused,alu_fused, sf_vc,sf_vc_se" (cond [(eq_attr "got" "load") (const_string "load") @@ -4564,6 +4564,23 @@ [(set_attr "type" "imul_fused")] ) +(define_insn "*zero_extract_fused" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extract:SI (match_operand:SI 1 "register_operand" "r") + (match_operand 2 "const_int_operand") + (match_operand 3 "const_int_operand")))] + "riscv_is_micro_arch (arcv_rhx100) && !TARGET_64BIT + && (INTVAL (operands[2]) > 1 || !TARGET_ZBS)" + { + int amount = INTVAL (operands[2]); + int end = INTVAL (operands[3]) + amount; + operands[2] = GEN_INT (BITS_PER_WORD - end); + operands[3] = GEN_INT (BITS_PER_WORD - amount); + return "slli\t%0,%1,%2\n\tsrli\t%0,%0,%3"; + } + [(set_attr "type" "alu_fused")] +) + ;; String compare with length insn. 
;; Argument 0 is the target (result) ;; Argument 1 is the source1 diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c new file mode 100644 index 000000000000..010038b52c96 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target rv32 } */ +/* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" "-Oz" "-Os" } } */ +/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im_zbs -mabi=ilp32" } */ + +#define bit_extract(x,start,amt) (((x)>>(start)) & (~(0xffffffff << (amt)))) + +int +f (int x) +{ + return bit_extract(x,10,14) + bit_extract(x,1,1); +} + +/* { dg-final { scan-assembler {\sslli\s([ast][0-9]+),a0,8\n\ssrli\s([ast][0-9]+),\1,18\n\sbexti\sa0,a0,1\n\sadd\sa0,\2,a0\n} } } */ From 08b3411122215b1e732c66e83bcc99e1fb240b68 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 9 Jul 2024 04:11:09 -0700 Subject: [PATCH 11/17] arcv: allow inverted instruction order for some fusion types Some fusion types (namely, LD/ST-OP/OPIMM and LD/ST-LUI) are available regardless of the order of instructions. To support this, extract the new arcv_memop_arith_pair_p () and arcv_memop_lui_pair_p () functions and call them twice. Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 141 ++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 44 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index c1fd5414657f..58fc18f46dee 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11088,6 +11088,97 @@ arcv_fused_addr_p (rtx addr0, rtx addr1) return false; } +/* Return true if PREV and CURR constitute an ordered load/store + op/opimm + pair, for the purposes of ARCV-specific macro-op fusion. 
*/ +static bool +arcv_memop_arith_pair_p (rtx_insn *prev, rtx_insn *curr) +{ + rtx prev_set = single_set (prev); + rtx curr_set = single_set (curr); + + gcc_assert (prev_set); + gcc_assert (curr_set); + + /* Fuse load/store + register post-{inc,dec}rement: + * prev (ld) == (set (reg:X rd1) (mem:X (plus:X (reg:X rs1) (const_int)))) + * or + * prev (st) == (set (mem:X (plus:X (reg:X rs1) (const_int))) (reg:X rs2)) + * ... + */ + if ((get_attr_type (curr) == TYPE_ARITH + || get_attr_type (curr) == TYPE_LOGICAL + || get_attr_type (curr) == TYPE_SHIFT + || get_attr_type (curr) == TYPE_SLT + || get_attr_type (curr) == TYPE_BITMANIP + || get_attr_type (curr) == TYPE_MIN + || get_attr_type (curr) == TYPE_MAX + || get_attr_type (curr) == TYPE_MINU + || get_attr_type (curr) == TYPE_MAXU + || get_attr_type (curr) == TYPE_CLZ + || get_attr_type (curr) == TYPE_CTZ) + && (CONST_INT_P (SET_SRC (curr_set)) + || REG_P (XEXP (SET_SRC (curr_set), 0))) + && ((get_attr_type (prev) == TYPE_LOAD + && REG_P (XEXP (SET_SRC (prev_set), 0)) + && REGNO (XEXP (SET_SRC (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (prev_set), 0)) + != REGNO (SET_DEST (prev_set)) + && REGNO (SET_DEST (prev_set)) != REGNO (SET_DEST (curr_set)) + && (/* (set (reg:X rd1) (not (reg:X rs1))) */ + GET_RTX_LENGTH (GET_CODE (SET_SRC (curr_set))) == 1 + /* (op-imm) == (set (reg:X rd2) (plus/minus (reg:X rs1) (const_int))) */ + || CONST_INT_P (XEXP (SET_SRC (curr_set), 1)) + /* (op) == (set (reg:X rd2) (plus/minus (reg:X rs1) (reg:X rs2))) */ + || REGNO (SET_DEST (prev_set)) + != REGNO (XEXP (SET_SRC (curr_set), 1)))) + || (get_attr_type (prev) == TYPE_STORE + && REG_P (XEXP (SET_DEST (prev_set), 0)) + && REGNO (XEXP (SET_DEST (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 0)) + && (/* (set (reg:X rd1) (not (reg:X rs1))) */ + GET_RTX_LENGTH (GET_CODE (SET_SRC (curr_set))) == 1 + /* (op-imm) == (set (reg:X rd2) (plus/minus (reg:X rs1) (const_int))) */ + || CONST_INT_P (XEXP 
(SET_SRC (curr_set), 1)) + /* (op) == (set (reg:X rd2) (plus/minus (reg:X rs1) (reg:X rs2))) */ + || REGNO (XEXP (SET_DEST (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 1)))))) + return true; + + return false; +} + +/* Return true if PREV and CURR constitute an ordered load/store + lui pair, for + the purposes of ARCV-specific macro-op fusion. */ +static bool +arcv_memop_lui_pair_p (rtx_insn *prev, rtx_insn *curr) +{ + rtx prev_set = single_set (prev); + rtx curr_set = single_set (curr); + + gcc_assert (prev_set); + gcc_assert (curr_set); + + /* Fuse load/store with lui: + * prev (ld) == (set (reg:X rd1) (mem:X (plus:X (reg:X) (const_int)))) + * or + * prev (st) == (set (mem:X (plus:X (reg:X) (const_int))) (reg:X rD)) + * + * curr (lui) == (set (reg:X rd2) (const_int UPPER_IMM_20)) + */ + if (REG_P (curr) + && ((get_attr_type (curr) == TYPE_MOVE + && GET_CODE (SET_SRC (curr_set)) == HIGH) + || (CONST_INT_P (SET_SRC (curr_set)) + && LUI_OPERAND (INTVAL (SET_SRC (curr_set))))) + && ((get_attr_type (prev) == TYPE_LOAD + && REGNO (SET_DEST (prev_set)) != REGNO (SET_DEST (curr_set))) + || get_attr_type (prev) == TYPE_STORE)) + return true; + + return false; +} + /* Return true if PREV and CURR should be kept together during scheduling. */ static bool @@ -11134,52 +11225,14 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return true; } - /* Fuse load/store + register post-{inc,dec}rement: - * prev (ld) == (set (reg:X rd1) (mem:X (plus:X (reg:X rs1) (const_int)))) - * or - * prev (st) == (set (mem:X (plus:X (reg:X rs1) (const_int))) (reg:X rs2)) - * ... 
- */ - if ((GET_CODE (SET_SRC (curr_set)) == PLUS - || GET_CODE (SET_SRC (curr_set)) == MINUS) - && REG_P (XEXP (SET_SRC (curr_set), 0)) - && ((get_attr_type (prev) == TYPE_LOAD - && REG_P (XEXP (SET_SRC (prev_set), 0)) - && REGNO (XEXP (SET_SRC (prev_set), 0)) - == REGNO (XEXP (SET_SRC (curr_set), 0)) - && REGNO (XEXP (SET_SRC (prev_set), 0)) - != REGNO (SET_DEST (prev_set)) - && REGNO (SET_DEST (prev_set)) != REGNO (SET_DEST (curr_set)) - /* curr (op-imm) == (set (reg:X rd2) (plus/minus (reg:X rs1) (const_int))) */ - && (CONST_INT_P (XEXP (SET_SRC (curr_set), 1)) - /* or curr (op) == (set (reg:X rd2) (plus/minus (reg:X rs1) (reg:X rs2))) */ - || REGNO (SET_DEST (prev_set)) - != REGNO (XEXP (SET_SRC (curr_set), 1)))) - || (get_attr_type (prev) == TYPE_STORE - && REG_P (XEXP (SET_DEST (prev_set), 0)) - && REGNO (XEXP (SET_DEST (prev_set), 0)) - == REGNO (XEXP (SET_SRC (curr_set), 0)) - /* curr (op-imm) == (set (reg:X rd2) (plus/minus (reg:X rs1) (const_int))) */ - && (CONST_INT_P (XEXP (SET_SRC (curr_set), 1)) - /* or curr (op) == (set (reg:X rd2) (plus/minus (reg:X rs1) (reg:X rs2))) */ - || REGNO (XEXP (SET_DEST (prev_set), 0)) - == REGNO (XEXP (SET_SRC (curr_set), 1)))))) + /* Fuse a pre- or post-update memory operation. */ + if (arcv_memop_arith_pair_p (prev, curr) + || arcv_memop_arith_pair_p (curr, prev)) return true; - /* Fuse load/store with lui: - * prev (ld) == (set (reg:X rd1) (mem:X (plus:X (reg:X) (const_int)))) - * or - * prev (st) == (set (mem:X (plus:X (reg:X) (const_int))) (reg:X rD)) - * - * curr (lui) == (set (reg:X rd2) (const_int UPPER_IMM_20)) - */ - if (((get_attr_type (curr) == TYPE_MOVE - && GET_CODE (SET_SRC (curr_set)) == HIGH) - || (CONST_INT_P (SET_SRC (curr_set)) - && LUI_OPERAND (INTVAL (SET_SRC (curr_set))))) - && ((get_attr_type (prev) == TYPE_LOAD - && REGNO (SET_DEST (prev_set)) != REGNO (SET_DEST (curr_set))) - || get_attr_type (prev) == TYPE_STORE)) + /* Fuse a memory operation preceded or followed by a lui. 
*/ + if (arcv_memop_lui_pair_p (prev, curr) + || arcv_memop_lui_pair_p (curr, prev)) return true; /* Fuse load-immediate with a store of the destination register. */ From 19f2bee498a20e6b13bacd533ea9eaab6b497acb Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 1 Oct 2024 01:02:17 -0700 Subject: [PATCH 12/17] arcv: add scheduling implementation for RHX-100 This commit implements the scheduling model for the RHX-100 core. Among notable things are: (1) The arcv_macro_fusion_pair_p () hook has been modified to not create SCHED_GROUP's larger than 2 instructions; also, it gives priority to double load/store fusion, suppressing the other types until sched2. (2) riscv_issue_rate () is set to 4 and the system is modeled as 4 separate pipelines, giving access to as many instructions in ready_list as possible. (3) The rhx.md description puts some initial constraints in place (e.g. memory ops can only go into pipe B), saving some work in the reordering hook. (4) The riscv_sched_variable_issue () and riscv_sched_reorder2 () hooks work together to make sure (in order of descending priority) that: (a) the critical path and the instruction priorities are respected; (b) both pipes are filled (taking advantage of parallel dispatch within the microarchitectural constraints); (c) there is as much fusion going on as possible (and the existing fusion pairs are not broken up). There is probably some room for improvement, and some tweaks will probably have to be made in response to HLA changes as the HW development process goes on. Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 283 ++++++++++++++++++++++++++++++++++---- 1 file changed, 259 insertions(+), 24 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 58fc18f46dee..5ee94035ad4b 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -340,6 +340,12 @@ unsigned riscv_stack_boundary; /* Whether in riscv_output_mi_thunk. 
*/ static bool riscv_in_thunk_func = false; +static int alu_pipe_scheduled_p; +static int pipeB_scheduled_p; + +static rtx_insn *last_scheduled_insn; +static short cached_can_issue_more; + /* If non-zero, this is an offset to be added to SP to redefine the CFA when restoring the FP register from the stack. Only valid when generating the epilogue. */ @@ -10847,6 +10853,21 @@ riscv_sched_init (FILE *, int, int) static int riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) { + /* Beginning of cycle - reset variables. */ + if (more == tune_param->issue_rate) + { + alu_pipe_scheduled_p = 0; + pipeB_scheduled_p = 0; + } + + if (alu_pipe_scheduled_p && pipeB_scheduled_p) + { + cached_can_issue_more = 0; + return 0; + } + + cached_can_issue_more = more; + if (DEBUG_INSN_P (insn)) return more; @@ -10892,6 +10913,28 @@ riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) } } + if (next_insn (insn) && INSN_P (next_insn (insn)) + && SCHED_GROUP_P (next_insn (insn))) + { + if (get_attr_type (insn) == TYPE_LOAD + || get_attr_type (insn) == TYPE_STORE + || get_attr_type (next_insn (insn)) == TYPE_LOAD + || get_attr_type (next_insn (insn)) == TYPE_STORE) + pipeB_scheduled_p = 1; + else + alu_pipe_scheduled_p = 1; + } + + if (get_attr_type (insn) == TYPE_ALU_FUSED + || get_attr_type (insn) == TYPE_IMUL_FUSED) + { + alu_pipe_scheduled_p = 1; + more -= 1; + } + + last_scheduled_insn = insn; + cached_can_issue_more = more - 1; + return more - 1; } @@ -11184,15 +11227,35 @@ arcv_memop_lui_pair_p (rtx_insn *prev, rtx_insn *curr) static bool arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) { + /* Never create sched groups with more than 2 members. */ + if (SCHED_GROUP_P (prev)) + return false; + rtx prev_set = single_set (prev); rtx curr_set = single_set (curr); - /* prev and curr are simple SET insns i.e. no flag setting or branching. 
*/ - bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); - /* Fuse load-immediate with a dependent conditional branch. */ - if (get_attr_type (prev) == TYPE_MOVE - && get_attr_move_type (prev) == MOVE_TYPE_CONST - && any_condjump_p (curr)) + /* Fuse multiply-add pair. */ + if (prev_set && curr_set && GET_CODE (SET_SRC (prev_set)) == MULT + && GET_CODE (SET_SRC (curr_set)) == PLUS + && (REG_P (XEXP (SET_SRC (curr_set), 0)) + && REGNO (SET_DEST (prev_set)) == + REGNO (XEXP (SET_SRC (curr_set), 0)) + || (REG_P (XEXP (SET_SRC (curr_set), 1)) + && REGNO (SET_DEST (prev_set)) == + REGNO (XEXP (SET_SRC (curr_set), 1))))) + return true; + + /* Fuse logical shift left with logical shift right (bit-extract pattern). */ + if (prev_set && curr_set && GET_CODE (SET_SRC (prev_set)) == ASHIFT + && GET_CODE (SET_SRC (curr_set)) == LSHIFTRT + && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set)) + && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 0))) + return true; + + /* Fuse load-immediate with a dependent conditional branch. */ + if (get_attr_type (prev) == TYPE_MOVE + && get_attr_move_type (prev) == MOVE_TYPE_CONST + && any_condjump_p (curr)) { rtx comp = XEXP (SET_SRC (curr_set), 0); @@ -11200,6 +11263,13 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) || (REG_P (XEXP (comp, 1)) && XEXP (comp, 1) == SET_DEST (prev_set)); } + /* Do not fuse loads/stores before sched2. */ + if (!reload_completed || sched_fusion) + return false; + + /* prev and curr are simple SET insns i.e. no flag setting or branching. */ + bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); + /* Don't handle anything with a jump past this point. */ if (!simple_sets_p) return false; @@ -11225,6 +11295,30 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return true; } + /* Look ahead 1 insn to make sure double loads/stores are always + fused together, even in the presence of other opportunities. 
*/ + if (next_insn (curr) && single_set (next_insn (curr)) + && get_attr_type (curr) == TYPE_LOAD + && get_attr_type (next_insn (curr)) == TYPE_LOAD) + { + rtx addr0 = XEXP (SET_SRC (curr_set), 0); + rtx addr1 = XEXP (SET_SRC (single_set (next_insn (curr))), 0); + + if (arcv_fused_addr_p (addr0, addr1)) + return false; + } + + if (next_insn (curr) && single_set (next_insn (curr)) + && get_attr_type (curr) == TYPE_STORE + && get_attr_type (next_insn (curr)) == TYPE_STORE) + { + rtx addr0 = XEXP (SET_DEST (curr_set), 0); + rtx addr1 = XEXP (SET_DEST (single_set (next_insn (curr))), 0); + + if (arcv_fused_addr_p (addr0, addr1)) + return false; + } + /* Fuse a pre- or post-update memory operation. */ if (arcv_memop_arith_pair_p (prev, curr) || arcv_memop_arith_pair_p (curr, prev)) @@ -11245,20 +11339,6 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) && SET_DEST (prev_set) == SUBREG_REG (SET_SRC (curr_set))))) return true; - if (GET_CODE (SET_SRC (prev_set)) == MULT - && GET_CODE (SET_SRC (curr_set)) == PLUS - && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set)) - && (REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 0)) - || REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 1)))) - return true; - - /* Fuse logical shift left with logical shift right (bit-extract pattern). */ - if (GET_CODE (SET_SRC (prev_set)) == ASHIFT - && GET_CODE (SET_SRC (curr_set)) == LSHIFTRT - && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set)) - && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 0))) - return true; - return false; } @@ -11980,17 +12060,21 @@ riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, we currently only perform the adjustment when -madjust-lmul-cost is given. 
*/ static int -riscv_sched_adjust_cost (rtx_insn *, int, rtx_insn *insn, int cost, - unsigned int) +riscv_sched_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, + int cost, unsigned int) { + if (riscv_is_micro_arch (arcv_rhx100) && dep_type == REG_DEP_ANTI + && !SCHED_GROUP_P (insn)) + return cost + 1; + /* Only do adjustments for the generic out-of-order scheduling model. */ if (!TARGET_VECTOR || riscv_microarchitecture != generic_ooo) return cost; - if (recog_memoized (insn) < 0) + if (recog_memoized (dep_insn) < 0) return cost; - enum attr_type type = get_attr_type (insn); + enum attr_type type = get_attr_type (dep_insn); if (type == TYPE_VFREDO || type == TYPE_VFWREDO) { @@ -12041,6 +12125,7 @@ riscv_sched_adjust_cost (rtx_insn *, int, rtx_insn *insn, int cost, return new_cost; } + /* Implement TARGET_SCHED_CAN_SPECULATE_INSN hook. Return true if insn can can be scheduled for speculative execution. Reject vsetvl instructions to prevent the scheduler from hoisting them out of basic blocks without @@ -12062,6 +12147,149 @@ riscv_sched_can_speculate_insn (rtx_insn *insn) } } +static void +riscv_sched_init (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + int max_ready ATTRIBUTE_UNUSED) +{ + last_scheduled_insn = 0; +} + +static int +riscv_sched_reorder2 (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + rtx_insn **ready, + int *n_readyp, + int clock ATTRIBUTE_UNUSED) +{ + if (sched_fusion) + return cached_can_issue_more; + + if (!cached_can_issue_more) + return 0; + + /* Fuse double load/store instances missed by sched_fusion. 
*/ + if (!pipeB_scheduled_p && last_scheduled_insn && ready && *n_readyp > 0 + && !SCHED_GROUP_P (last_scheduled_insn) + && (get_attr_type (last_scheduled_insn) == TYPE_LOAD + || get_attr_type (last_scheduled_insn) == TYPE_STORE)) + { + for (int i = 1; i <= *n_readyp; i++) + { + if (NONDEBUG_INSN_P (ready[*n_readyp - i]) + && !SCHED_GROUP_P (ready[*n_readyp - i]) + && (!next_insn (ready[*n_readyp - i]) + || !NONDEBUG_INSN_P (next_insn (ready[*n_readyp - i])) + || !SCHED_GROUP_P (next_insn (ready[*n_readyp - i]))) + && arcv_macro_fusion_pair_p (last_scheduled_insn, ready[*n_readyp - i])) + { + std::swap (ready[*n_readyp - 1], ready[*n_readyp - i]); + SCHED_GROUP_P (ready[*n_readyp - 1]) = 1; + pipeB_scheduled_p = 1; + return cached_can_issue_more; + } + } + pipeB_scheduled_p = 1; + } + + /* Try to fuse a non-memory last_scheduled_insn. */ + if ((!alu_pipe_scheduled_p || !pipeB_scheduled_p) + && last_scheduled_insn && ready && *n_readyp > 0 + && !SCHED_GROUP_P (last_scheduled_insn) + && (get_attr_type (last_scheduled_insn) != TYPE_LOAD + && get_attr_type (last_scheduled_insn) != TYPE_STORE)) + { + for (int i = 1; i <= *n_readyp; i++) + { + if (NONDEBUG_INSN_P (ready[*n_readyp - i]) + && !SCHED_GROUP_P (ready[*n_readyp - i]) + && (!next_insn (ready[*n_readyp - i]) + || !NONDEBUG_INSN_P (next_insn (ready[*n_readyp - i])) + || !SCHED_GROUP_P (next_insn (ready[*n_readyp - i]))) + && arcv_macro_fusion_pair_p (last_scheduled_insn, ready[*n_readyp - i])) + { + if (get_attr_type (ready[*n_readyp - i]) == TYPE_LOAD + || get_attr_type (ready[*n_readyp - i]) == TYPE_STORE) + if (pipeB_scheduled_p) + continue; + else + pipeB_scheduled_p = 1; + else if (!alu_pipe_scheduled_p) + alu_pipe_scheduled_p = 1; + else + pipeB_scheduled_p = 1; + + std::swap (ready[*n_readyp - 1], ready[*n_readyp - i]); + SCHED_GROUP_P (ready[*n_readyp - 1]) = 1; + return cached_can_issue_more; + } + } + alu_pipe_scheduled_p = 1; + } + + /* When pipe B is scheduled, we can have no more memops this cycle. 
*/ + if (pipeB_scheduled_p && *n_readyp > 0 + && NONDEBUG_INSN_P (ready[*n_readyp - 1]) + && recog_memoized (ready[*n_readyp - 1]) >= 0 + && !SCHED_GROUP_P (ready[*n_readyp - 1]) + && (get_attr_type (ready[*n_readyp - 1]) == TYPE_LOAD + || get_attr_type (ready[*n_readyp - 1]) == TYPE_STORE)) + { + if (alu_pipe_scheduled_p) + return 0; + + for (int i = 2; i <= *n_readyp; i++) + { + if ((NONDEBUG_INSN_P (ready[*n_readyp - i]) + && recog_memoized (ready[*n_readyp - i]) >= 0 + && get_attr_type (ready[*n_readyp - i]) != TYPE_LOAD + && get_attr_type (ready[*n_readyp - i]) != TYPE_STORE + && !SCHED_GROUP_P (ready[*n_readyp - i]) + && ((!next_insn (ready[*n_readyp - i]) + || !NONDEBUG_INSN_P (next_insn (ready[*n_readyp - i])) + || !SCHED_GROUP_P (next_insn (ready[*n_readyp - i]))))) + || ((next_insn (ready[*n_readyp - i]) + && NONDEBUG_INSN_P (next_insn (ready[*n_readyp - i])) + && recog_memoized (next_insn (ready[*n_readyp - i])) >= 0 + && get_attr_type (next_insn (ready[*n_readyp - i])) != TYPE_LOAD + && get_attr_type (next_insn (ready[*n_readyp - i])) != TYPE_STORE))) + { + std::swap (ready[*n_readyp - 1], ready[*n_readyp - i]); + alu_pipe_scheduled_p = 1; + cached_can_issue_more = 1; + return 1; + } + } + return 0; + } + + /* If all else fails, schedule a single instruction. 
*/ + if (ready && *n_readyp > 0 + && NONDEBUG_INSN_P (ready[*n_readyp - 1]) + && recog_memoized (ready[*n_readyp - 1]) >= 0 + && get_attr_type (ready[*n_readyp - 1]) != TYPE_LOAD + && get_attr_type (ready[*n_readyp - 1]) != TYPE_STORE) + { + if (!pipeB_scheduled_p + && (get_attr_type (ready[*n_readyp - 1]) == TYPE_LOAD + || get_attr_type (ready[*n_readyp - 1]) == TYPE_STORE)) + { + alu_pipe_scheduled_p = pipeB_scheduled_p = 1; + cached_can_issue_more = 1; + return 1; + } + else if (get_attr_type (ready[*n_readyp - 1]) != TYPE_LOAD + || get_attr_type (ready[*n_readyp - 1]) != TYPE_STORE) + { + alu_pipe_scheduled_p = pipeB_scheduled_p = 1; + cached_can_issue_more = 1; + return 1; + } + } + + return cached_can_issue_more; +} + /* Auxiliary function to emit RISC-V ELF attribute. */ static void riscv_emit_attribute () @@ -16668,9 +16896,16 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode) #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST riscv_sched_adjust_cost + #undef TARGET_SCHED_CAN_SPECULATE_INSN #define TARGET_SCHED_CAN_SPECULATE_INSN riscv_sched_can_speculate_insn +#undef TARGET_SCHED_REORDER2 +#define TARGET_SCHED_REORDER2 riscv_sched_reorder2 + +#undef TARGET_SCHED_INIT +#define TARGET_SCHED_INIT riscv_sched_init + #undef TARGET_FUNCTION_OK_FOR_SIBCALL #define TARGET_FUNCTION_OK_FOR_SIBCALL riscv_function_ok_for_sibcall From 9e7d3cc3928c6565d1dd3a7e644ba6ad99957e94 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 3 Dec 2024 00:49:30 -0800 Subject: [PATCH 13/17] arcv: adjust scheduling priority of memop pairs for RHX-100 This patch implements riscv_sched_adjust_priority () for the RHX-100 microarchitecture by slightly bumping the priority of load/store pairs. As a consequence of this change, it becomes easier for riscv_sched_reorder2 () to schedule instructions in the memory pipe. 
Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 5ee94035ad4b..34c0a3d37fe4 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -12147,6 +12147,31 @@ riscv_sched_can_speculate_insn (rtx_insn *insn) } } +static int +riscv_sched_adjust_priority (rtx_insn *insn, int priority) +{ + if (!riscv_is_micro_arch (arcv_rhx100)) + return priority; + + if (DEBUG_INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE + || GET_CODE (PATTERN (insn)) == CLOBBER) + return priority; + + /* Bump the priority of fused load-store pairs for easier + scheduling of the memory pipe. The specific increase + value is determined empirically. */ + if (next_insn (insn) && INSN_P (next_insn (insn)) + && SCHED_GROUP_P (next_insn (insn)) + && ((get_attr_type (insn) == TYPE_STORE + && get_attr_type (next_insn (insn)) == TYPE_STORE) + || (get_attr_type (insn) == TYPE_LOAD + && get_attr_type (next_insn (insn)) == TYPE_LOAD))) + return priority + 1; + + return priority; +} + + static void riscv_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, @@ -16896,10 +16921,12 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode) #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST riscv_sched_adjust_cost - #undef TARGET_SCHED_CAN_SPECULATE_INSN #define TARGET_SCHED_CAN_SPECULATE_INSN riscv_sched_can_speculate_insn +#undef TARGET_SCHED_ADJUST_PRIORITY +#define TARGET_SCHED_ADJUST_PRIORITY riscv_sched_adjust_priority + #undef TARGET_SCHED_REORDER2 #define TARGET_SCHED_REORDER2 riscv_sched_reorder2 From 12c7874e556190a719c74dacc69ae92558d81794 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Thu, 28 Nov 2024 02:08:08 -0800 Subject: [PATCH 14/17] arcv: fuse LH+LH and LB+LB instruction pairs In addition to the LW+LW and SW+SW pairs that are already being recognized as macro-op-fusable, add 
support for 8-bit and naturally aligned 16-bit loads operating on adjacent memory locations. To that end, introduce the new microarch-specific pair_fusion_mode_allowed_p () predicate, and call it from fusion_load_store () during sched_fusion, and from arcv_macro_fusion_pair_p () during regular scheduling passes. Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.cc | 115 ++++++++++++++++++++++++++------------ 1 file changed, 80 insertions(+), 35 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 34c0a3d37fe4..60ac19ea6726 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11084,37 +11084,73 @@ riscv_set_is_shNadduw (rtx set) && REG_P (SET_DEST (set))); } +/* Return TRUE if the target microarchitecture supports macro-op + fusion for two memory operations of mode MODE (the direction + of transfer is determined by the IS_LOAD parameter). */ + +static bool +pair_fusion_mode_allowed_p (machine_mode mode, bool is_load) +{ + if (!riscv_is_micro_arch (arcv_rhx100)) + return true; + + return ((is_load && (mode == SImode + || mode == HImode + || mode == QImode)) + || (!is_load && mode == SImode)); +} + /* Return TRUE if two addresses can be fused. */ static bool -arcv_fused_addr_p (rtx addr0, rtx addr1) +arcv_fused_addr_p (rtx addr0, rtx addr1, bool is_load) { rtx base0, base1, tmp; HOST_WIDE_INT off0 = 0, off1 = 0; - if (GET_CODE (addr0) == PLUS) + if (GET_CODE (addr0) == SIGN_EXTEND || GET_CODE (addr0) == ZERO_EXTEND) + addr0 = XEXP (addr0, 0); + + if (GET_CODE (addr1) == SIGN_EXTEND || GET_CODE (addr1) == ZERO_EXTEND) + addr1 = XEXP (addr1, 0); + + if (!MEM_P (addr0) || !MEM_P (addr1)) + return false; + + /* Require the accesses to have the same mode. */ + if (GET_MODE (addr0) != GET_MODE (addr1)) + return false; + + /* Check if the mode is allowed. 
*/ + if (!pair_fusion_mode_allowed_p (GET_MODE (addr0), is_load)) + return false; + + rtx reg0 = XEXP (addr0, 0); + rtx reg1 = XEXP (addr1, 0); + + if (GET_CODE (reg0) == PLUS) { - base0 = XEXP (addr0, 0); - tmp = XEXP (addr0, 1); + base0 = XEXP (reg0, 0); + tmp = XEXP (reg0, 1); if (!CONST_INT_P (tmp)) return false; off0 = INTVAL (tmp); } - else if (REG_P (addr0)) - base0 = addr0; + else if (REG_P (reg0)) + base0 = reg0; else return false; - if (GET_CODE (addr1) == PLUS) + if (GET_CODE (reg1) == PLUS) { - base1 = XEXP (addr1, 0); - tmp = XEXP (addr1, 1); + base1 = XEXP (reg1, 0); + tmp = XEXP (reg1, 1); if (!CONST_INT_P (tmp)) return false; off1 = INTVAL (tmp); } - else if (REG_P (addr1)) - base1 = addr1; + else if (REG_P (reg1)) + base1 = reg1; else return false; @@ -11123,9 +11159,9 @@ arcv_fused_addr_p (rtx addr0, rtx addr1) if (REGNO (base0) != REGNO (base1)) return false; - /* Offsets have to be aligned to word boundary and adjacent in memory, - but the memory operations can be narrower. */ - if ((off0 % UNITS_PER_WORD == 0) && (abs (off1 - off0) == UNITS_PER_WORD)) + /* Fuse adjacent aligned addresses. 
*/ + if ((off0 % GET_MODE_SIZE (GET_MODE (addr0)).to_constant () == 0) + && (abs (off1 - off0) == GET_MODE_SIZE (GET_MODE (addr0)).to_constant ())) return true; return false; @@ -11278,20 +11314,14 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) if (get_attr_type (prev) == TYPE_LOAD && get_attr_type (curr) == TYPE_LOAD) { - rtx addr0 = XEXP (SET_SRC (prev_set), 0); - rtx addr1 = XEXP (SET_SRC (curr_set), 0); - - if (arcv_fused_addr_p (addr0, addr1)) + if (arcv_fused_addr_p (SET_SRC (prev_set), SET_SRC (curr_set), true)) return true; } if (get_attr_type (prev) == TYPE_STORE && get_attr_type (curr) == TYPE_STORE) { - rtx addr0 = XEXP (SET_DEST (prev_set), 0); - rtx addr1 = XEXP (SET_DEST (curr_set), 0); - - if (arcv_fused_addr_p (addr0, addr1)) + if (arcv_fused_addr_p (SET_DEST (prev_set), SET_DEST (curr_set), false)) return true; } @@ -11301,10 +11331,9 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) && get_attr_type (curr) == TYPE_LOAD && get_attr_type (next_insn (curr)) == TYPE_LOAD) { - rtx addr0 = XEXP (SET_SRC (curr_set), 0); - rtx addr1 = XEXP (SET_SRC (single_set (next_insn (curr))), 0); - - if (arcv_fused_addr_p (addr0, addr1)) + if (arcv_fused_addr_p (SET_SRC (curr_set), + SET_SRC (single_set (next_insn (curr))), + true)) return false; } @@ -11312,10 +11341,9 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) && get_attr_type (curr) == TYPE_STORE && get_attr_type (next_insn (curr)) == TYPE_STORE) { - rtx addr0 = XEXP (SET_DEST (curr_set), 0); - rtx addr1 = XEXP (SET_DEST (single_set (next_insn (curr))), 0); - - if (arcv_fused_addr_p (addr0, addr1)) + if (arcv_fused_addr_p (SET_DEST (curr_set), + SET_DEST (single_set (next_insn (curr))), + false)) return false; } @@ -11986,7 +12014,8 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) otherwise return FALSE. 
*/ static bool -fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load) +fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, machine_mode *mode, + bool *is_load) { rtx x, dest, src; @@ -11997,15 +12026,22 @@ fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load) src = SET_SRC (x); dest = SET_DEST (x); + + if ((GET_CODE (src) == SIGN_EXTEND || GET_CODE (src) == ZERO_EXTEND) + && MEM_P (XEXP (src, 0))) + src = XEXP (src, 0); + if (REG_P (src) && MEM_P (dest)) { *is_load = false; - extract_base_offset_in_addr (dest, base, offset); + if (extract_base_offset_in_addr (dest, base, offset)) + *mode = GET_MODE (dest); } else if (MEM_P (src) && REG_P (dest)) { *is_load = true; - extract_base_offset_in_addr (src, base, offset); + if (extract_base_offset_in_addr (src, base, offset)) + *mode = GET_MODE (src); } else return false; @@ -12020,11 +12056,13 @@ riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, int tmp, off_val; bool is_load; rtx base, offset; + machine_mode mode = SImode; gcc_assert (INSN_P (insn)); tmp = max_pri - 1; - if (!fusion_load_store (insn, &base, &offset, &is_load)) + if (!fusion_load_store (insn, &base, &offset, &mode, &is_load) + || !pair_fusion_mode_allowed_p (mode, is_load)) { *pri = tmp; *fusion_pri = tmp; @@ -12033,6 +12071,11 @@ riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, tmp /= 2; + if (mode == HImode) + tmp /= 2; + else if (mode == QImode) + tmp /= 4; + /* INSN with smaller base register goes first. */ tmp -= ((REGNO (base) & 0xff) << 20); @@ -12041,7 +12084,9 @@ riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, /* Put loads/stores operating on adjacent words into the same * scheduling group. 
*/ - *fusion_pri = tmp - ((off_val / (UNITS_PER_WORD * 2)) << 1) + is_load; + *fusion_pri = tmp + - ((off_val / (GET_MODE_SIZE (mode).to_constant () * 2)) << 1) + + is_load; if (off_val >= 0) tmp -= (off_val & 0xfffff); From a8477b0648d54a8e140ed77555f2afea38c8ff48 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 6 May 2025 00:06:35 -0700 Subject: [PATCH 15/17] arcv: do not emit 64-bit MAC pairs for 32-bit data Currently on ARC-V, the maddhisi3 pattern always expands to the madd_split_fused instruction regardless of the target word size, which leads to the full-width mul and add instructions being emitted for 32-bit data even on riscv64: mul a6,a4,s6 add a6,a6,s7 sext.w s7,a6 To fix this, add another define_insn (madd_split_fused_extended) pattern wrapping the result of a MAC operation into a sign-extension from 32 to 64 bits, and use it in the (u)maddhisi3 expander in case of a 64-bit target. The assembly code after this change is more efficient, viz.: mulw a6,a4,s6 addw a6,a6,s7 Signed-off-by: Artemiy Volkov --- gcc/config/riscv/riscv.md | 55 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index eece6f1c17e7..3987823be2d3 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -4510,7 +4510,21 @@ rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); emit_insn (gen_extendhisi2 (tmp0, operands[1])); emit_insn (gen_extendhisi2 (tmp1, operands[2])); - emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3])); + + if (TARGET_64BIT) + { + rtx op0 = gen_reg_rtx (DImode); + emit_insn (gen_madd_split_fused_extended (op0, tmp0, tmp1, operands[3])); + op0 = gen_lowpart (SImode, op0); + SUBREG_PROMOTED_VAR_P (op0) = 1; + SUBREG_PROMOTED_SET (op0, SRP_SIGNED); + emit_move_insn (operands[0], op0); + } + else + { + emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3])); + } + DONE; } } @@ -4528,7 +4542,21 @@ 
rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); emit_insn (gen_zero_extendhisi2 (tmp0, operands[1])); emit_insn (gen_zero_extendhisi2 (tmp1, operands[2])); - emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3])); + + if (TARGET_64BIT) + { + rtx op0 = gen_reg_rtx (DImode); + emit_insn (gen_madd_split_fused_extended (op0, tmp0, tmp1, operands[3])); + op0 = gen_lowpart (SImode, op0); + SUBREG_PROMOTED_VAR_P (op0) = 1; + SUBREG_PROMOTED_SET (op0, SRP_SIGNED); + emit_move_insn (operands[0], op0); + } + else + { + emit_insn (gen_madd_split_fused (operands[0], tmp0, tmp1, operands[3])); + } + DONE; } ) @@ -4564,6 +4592,29 @@ [(set_attr "type" "imul_fused")] ) +(define_insn "madd_split_fused_extended" + [(set (match_operand:DI 0 "register_operand" "=&r,r") + (sign_extend:DI + (plus:SI + (mult:SI (match_operand:SI 1 "register_operand" "r,r") + (match_operand:SI 2 "register_operand" "r,r")) + (match_operand:SI 3 "register_operand" "r,?0")))) + (clobber (match_scratch:SI 4 "=&r,&r"))] + "arcv_micro_arch_supports_fusion_p () + && (TARGET_ZMMUL || TARGET_MUL)" + { + if (REGNO (operands[0]) == REGNO (operands[3])) + { + return "mulw\t%4,%1,%2\n\taddw\t%4,%3,%4\n\tmv\t%0,%4"; + } + else + { + return "mulw\t%0,%1,%2\n\taddw\t%0,%0,%3"; + } + } + [(set_attr "type" "imul_fused")] +) + (define_insn "*zero_extract_fused" [(set (match_operand:SI 0 "register_operand" "=r") (zero_extract:SI (match_operand:SI 1 "register_operand" "r") From fc92d40591315cf2fc50d9256e60fa918a2639d4 Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Thu, 17 Jul 2025 05:45:37 -0700 Subject: [PATCH 16/17] arcv: Disable *3 when fusion is available This define_insn_and_split prevents *zero_extract_fused from being selected. Updated the test. It succeeded despite the fused case not being selected because the right instructions were produced still. 
Signed-off-by: Michiel Derhaeg --- gcc/config/riscv/iterators.md | 2 ++ gcc/config/riscv/riscv.md | 5 ++++- gcc/doc/riscv-mtune.texi | 2 ++ gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c | 4 ++-- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md index 35de17f76cd9..df979031cd88 100644 --- a/gcc/config/riscv/iterators.md +++ b/gcc/config/riscv/iterators.md @@ -218,6 +218,8 @@ (zero_extract "srliw")]) (define_code_attr extract_shift [(sign_extract "ashiftrt") (zero_extract "lshiftrt")]) +(define_code_attr is_zero_extract [(sign_extract "false") + (zero_extract "true")]) ;; This code iterator allows the two right shift instructions to be ;; generated from the same template. diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 3987823be2d3..b60daf217d2c 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -3101,6 +3101,7 @@ ;; * Single-bit extraction (SFB) ;; * Extraction instruction th.ext(u) (XTheadBb) ;; * lshrsi3_extend_2 (see above) +;; * Zero extraction fusion (ARC-V) (define_insn_and_split "*3" [(set (match_operand:GPR 0 "register_operand" "=r") (any_extract:GPR @@ -3113,6 +3114,8 @@ && (INTVAL (operands[2]) == 1)) && !TARGET_XTHEADBB && !TARGET_XANDESPERF + && !(riscv_is_micro_arch (arcv_rhx100) + && ) && !(TARGET_64BIT && (INTVAL (operands[3]) > 0) && (INTVAL (operands[2]) + INTVAL (operands[3]) == 32))" @@ -4600,7 +4603,7 @@ (match_operand:SI 2 "register_operand" "r,r")) (match_operand:SI 3 "register_operand" "r,?0")))) (clobber (match_scratch:SI 4 "=&r,&r"))] - "arcv_micro_arch_supports_fusion_p () + "riscv_is_micro_arch (arcv_rhx100) && (TARGET_ZMMUL || TARGET_MUL)" { if (REGNO (operands[0]) == REGNO (operands[3])) diff --git a/gcc/doc/riscv-mtune.texi b/gcc/doc/riscv-mtune.texi index 63a01db67726..8ffb3db906fe 100644 --- a/gcc/doc/riscv-mtune.texi +++ b/gcc/doc/riscv-mtune.texi @@ -52,6 +52,8 @@ particular CPU name. 
Permissible values for this option are: @samp{arc-v-rmx-100-series}, +@samp{arc-v-rhx-100-series}, + @samp{generic-ooo}, @samp{size}, diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c index 010038b52c96..7abf54ec1448 100644 --- a/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target rv32 } */ /* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" "-Oz" "-Os" } } */ -/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im_zbs -mabi=ilp32" } */ +/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im_zbs -mabi=ilp32 -dp" } */ #define bit_extract(x,start,amt) (((x)>>(start)) & (~(0xffffffff << (amt)))) @@ -11,4 +11,4 @@ f (int x) return bit_extract(x,10,14) + bit_extract(x,1,1); } -/* { dg-final { scan-assembler {\sslli\s([ast][0-9]+),a0,8\n\ssrli\s([ast][0-9]+),\1,18\n\sbexti\sa0,a0,1\n\sadd\sa0,\2,a0\n} } } */ +/* { dg-final { scan-assembler {\sslli\s([ast][0-9]+),a0,8.*zero_extract_fused\n\ssrli\s([ast][0-9]+),\1,18\n\sbexti\sa0,a0,1.*\n\sadd\sa0,\2,a0.*\n} } } */ From 6762c997c4f4019471e76649d065fbf27199d316 Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Wed, 15 Oct 2025 10:10:21 +0200 Subject: [PATCH 17/17] fixup! arcv: Add initial scheduling scheme. --- gcc/config/riscv/riscv.md | 1 - 1 file changed, 1 deletion(-) diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index b60daf217d2c..67e8e76d725b 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -673,7 +673,6 @@ ;; Keep this in sync with enum riscv_microarchitecture. (define_attr "tune" "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,generic_ooo,mips_p8700,tt_ascalon_d8,arcv_rmx100,arcv_rhx100" - "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,arcv_rhx100,generic_ooo" (const (symbol_ref "((enum attr_tune) riscv_microarchitecture)"))) ;; Describe a user's asm statement.