Skip to content

Missed optimization: adjacent stores to memory not merged #147456

@es1024

Description

@es1024

Example (godbolt):

#include <cstdint>
#include <cstddef>

// Reproducer aggregate: four narrow leading fields (16 + 8 + 8 + 32 bits)
// followed by N 64-bit words.
template<size_t N>
struct Test {
  uint16_t a;
  uint8_t b;
  uint8_t c;
  uint32_t d;

  // Only member with a default initializer (zero-filled); N selects the
  // overall object size used to compare the two cases in this report.
  uint64_t extra[N] = {};
};

// Builds a Test<N> whose leading fields are slices of x:
//   a <- bits 0-15, b <- bits 16-23, c <- bits 24-31, d <- bits 32-63.
// `extra` is left at its zero default initializer.
template<size_t N>
Test<N> test(uint64_t x) {
  Test<N> t;
  t.a = static_cast<uint16_t>(x);
  t.b = static_cast<uint8_t>(x >> 16);
  t.c = static_cast<uint8_t>(x >> 24);
  t.d = static_cast<uint32_t>(x >> 32);
  return t;
}

// Explicit instantiations so that code is emitted for both sizes under
// comparison (N = 1 and N = 2).
template Test<1> test<1>(uint64_t);
template Test<2> test<2>(uint64_t);

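For test<2> (the 24-byte case), the assignments to a/b/c/d are emitted exactly as written above (3 shifts, 4 truncates, 4 stores), even though together they cover the first 8 bytes of the struct and, on this little-endian target, could be reduced to a single 64-bit store of x: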

; test<2>: the result is written through the sret pointer %0; %1 is the i64
; argument. The leading fields are written by four separate narrow stores,
; each fed by its own trunc of %1 (or of %1 shifted right).
define weak_odr dso_local void @Test<2ul> test<2ul>(unsigned long)(ptr dead_on_unwind noalias writable sret(%struct.Test.0) align 8 %0, i64 noundef %1) local_unnamed_addr #0 comdat !dbg !138 {
    #dbg_value(i64 %1, !142, !DIExpression(), !144)
    #dbg_declare(ptr %0, !143, !DIExpression(), !145)
    #dbg_value(ptr %0, !146, !DIExpression(), !154)
  ; Zero the 16 bytes starting at byte offset 8.
  %3 = getelementptr inbounds nuw i8, ptr %0, i64 8, !dbg !156
  tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %3, i8 0, i64 16, i1 false), !dbg !156
  ; Offset 0: i16 store of the low 16 bits of %1.
  %4 = trunc i64 %1 to i16, !dbg !161
  store i16 %4, ptr %0, align 8, !dbg !162
  ; Offset 2: i8 store of (%1 >> 16).
  %5 = lshr i64 %1, 16, !dbg !167
  %6 = trunc i64 %5 to i8, !dbg !168
  %7 = getelementptr inbounds nuw i8, ptr %0, i64 2, !dbg !169
  store i8 %6, ptr %7, align 2, !dbg !170
  ; Offset 3: i8 store of (%1 >> 24).
  %8 = lshr i64 %1, 24, !dbg !172
  %9 = trunc i64 %8 to i8, !dbg !173
  %10 = getelementptr inbounds nuw i8, ptr %0, i64 3, !dbg !174
  store i8 %9, ptr %10, align 1, !dbg !175
  ; Offset 4: i32 store of (%1 >> 32).
  %11 = lshr i64 %1, 32, !dbg !177
  %12 = trunc nuw i64 %11 to i32, !dbg !178
  %13 = getelementptr inbounds nuw i8, ptr %0, i64 4, !dbg !179
  store i32 %12, ptr %13, align 4, !dbg !180
  ret void, !dbg !182
}

For test<1> (16 byte case), the assignments are combined as expected (but this case is a bit different since no store to memory is involved):

; test<1>: the result is returned by value as { i64, i64 }. Element 0 is the
; argument %0 unchanged and element 1 is the constant 0; no shifts, truncs or
; stores appear in the function body.
define weak_odr dso_local { i64, i64 } @Test<1ul> test<1ul>(unsigned long)(i64 noundef %0) local_unnamed_addr #0 comdat !dbg !130 {
    #dbg_value(i64 %0, !134, !DIExpression(), !136)
    #dbg_value(i64 0, !135, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !136)
    #dbg_value(i64 %0, !135, !DIExpression(DW_OP_LLVM_fragment, 0, 16), !136)
    #dbg_value(i64 %0, !135, !DIExpression(DW_OP_constu, 16, DW_OP_shr, DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 8), !136)
    #dbg_value(i64 %0, !135, !DIExpression(DW_OP_constu, 24, DW_OP_shr, DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 24, 8), !136)
    #dbg_value(i64 %0, !135, !DIExpression(DW_OP_constu, 32, DW_OP_shr, DW_OP_stack_value, DW_OP_LLVM_fragment, 32, 32), !136)
  %2 = insertvalue { i64, i64 } poison, i64 %0, 0, !dbg !137
  %3 = insertvalue { i64, i64 } %2, i64 0, 1, !dbg !137
  ret { i64, i64 } %3, !dbg !137
}

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions