From d6271e58b04de9e6ec66e4118fab96e9ec59d52b Mon Sep 17 00:00:00 2001
From: Jameson Nash
Date: Tue, 11 Mar 2025 13:50:34 +0000
Subject: [PATCH] fix alignment computation for nested objects

The alignment of a nested object (in C layouts) is not affected by the
alignment of its parent container when computing a field offset. This can be
strongly counter-intuitive (it implies adding padding where it does not seem
to provide value), but it is required to match the C standard. It also
permits users to write custom allocators that happen to provide alignment in
excess of what codegen may assume is guaranteed, and to get the behavioral
characteristics they intended to specify (without resorting to computing
explicit padding).

Addresses https://github.com/JuliaLang/julia/issues/57713#issuecomment-2714111074
---
 doc/src/manual/calling-c-and-fortran-code.md | 24 ++++++++++++-------
 src/cgutils.cpp                              | 13 ++++------
 src/datatype.c                               | 25 ++++++++++++++------
 test/core.jl                                 |  7 ++++++
 4 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/doc/src/manual/calling-c-and-fortran-code.md b/doc/src/manual/calling-c-and-fortran-code.md
index d198c796a2e0b..aa317468b0f75 100644
--- a/doc/src/manual/calling-c-and-fortran-code.md
+++ b/doc/src/manual/calling-c-and-fortran-code.md
@@ -547,15 +547,14 @@ is not valid, since the type layout of `T` is not known statically.
 
 ### SIMD Values
 
-Note: This feature is currently implemented on 64-bit x86 and AArch64 platforms only.
-
 If a C/C++ routine has an argument or return value that is a native SIMD type, the corresponding
 Julia type is a homogeneous tuple of `VecElement` that naturally maps to the SIMD type. Specifically:
 
-> * The tuple must be the same size as the SIMD type. For example, a tuple representing an `__m128`
->   on x86 must have a size of 16 bytes.
-> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a primitive type that
->   is 1, 2, 4 or 8 bytes.
+> * The tuple must have the same size and element types as the SIMD type. For example, a
+>   tuple representing an `__m128` on x86 must have a size of 16 bytes and `Float32` elements.
+> * The element type of the tuple must be an instance of `VecElement{T}`, where `T` is a
+>   primitive type whose size is a power-of-two number of bytes (e.g. 1, 2, 4, 8, or 16),
+>   such as `Int8` or `Float64`.
 
 For instance, consider this C routine that uses AVX intrinsics:
 
@@ -628,6 +627,10 @@ For translating a C argument list to Julia:
 
   * `T`, where `T` is a concrete Julia type
 
      * argument value will be copied (passed by value)
+  * `vector T` (or `__attribute__((vector_size))`, or a typedef such as `__m128`)
+
+      * `NTuple{N, VecElement{T}}`, where `T` is a primitive Julia type of the correct size
+        and `N` is the number of elements in the vector (equal to `vector_size / sizeof(T)`).
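+
+        For example, assuming a hypothetical shared library `libvecdemo` that
+        exports `__m128 vec4_add(__m128 a, __m128 b)` (the library and function
+        names here are illustrative only), the call could be written as:
+
+        ```julia
+        const m128 = NTuple{4,VecElement{Float32}}
+
+        # Build two 4 x Float32 SIMD values; each lane is wrapped in a VecElement.
+        a = m128((VecElement(1.0f0), VecElement(2.0f0), VecElement(3.0f0), VecElement(4.0f0)))
+        b = m128((VecElement(4.0f0), VecElement(3.0f0), VecElement(2.0f0), VecElement(1.0f0)))
+
+        # The tuple type appears directly as the ccall argument and return type.
+        c = ccall((:vec4_add, "libvecdemo"), m128, (m128, m128), a, b)
+        ```
+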
  * `void*`
 
      * depends on how this parameter is used, first translate this to the intended pointer type, then
@@ -674,13 +677,16 @@ For translating a C return type to Julia:
 
   * `T`, where `T` is one of the primitive types: `char`, `int`, `long`, `short`, `float`,
     `double`, `complex`, `enum` or any of their `typedef` equivalents
 
-  * `T`, where `T` is an equivalent Julia Bits Type (per the table above)
-  * if `T` is an `enum`, the argument type should be equivalent to `Cint` or `Cuint`
+  * same as C argument list
   * argument value will be copied (returned by-value)
 * `struct T` (including typedef to a struct)
 
-  * `T`, where `T` is a concrete Julia Type
+  * same as C argument list
   * argument value will be copied (returned by-value)
+
+ * `vector T`
+
+   * same as C argument list
 
  * `void*`
 
      * depends on how this parameter is used, first translate this to the intended pointer type, then
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 8c40e12539598..fb4c9aa8cb415 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -3103,11 +3103,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
     else if (strct.ispointer()) {
         auto tbaa = best_field_tbaa(ctx, strct, jt, idx, byte_offset);
         Value *staddr = data_pointer(ctx, strct);
-        Value *addr;
-        if (jl_is_vecelement_type((jl_value_t*)jt) || byte_offset == 0)
-            addr = staddr; // VecElement types are unwrapped in LLVM.
-        else
-            addr = emit_ptrgep(ctx, staddr, byte_offset);
+        Value *addr = (byte_offset == 0 ? staddr : emit_ptrgep(ctx, staddr, byte_offset));
         if (addr != staddr)
             setNameWithField(ctx.emission_context, addr, get_objname, jt, idx, Twine("_ptr"));
         if (jl_field_isptr(jt, idx)) {
@@ -3571,7 +3567,7 @@ static void union_alloca_type(jl_uniontype_t *ut,
             [&](unsigned idx, jl_datatype_t *jt) {
                 if (!jl_is_datatype_singleton(jt)) {
                     size_t nb1 = jl_datatype_size(jt);
-                    size_t align1 = jl_datatype_align(jt);
+                    size_t align1 = julia_alignment((jl_value_t*)jt);
                     if (nb1 > nbytes)
                         nbytes = nb1;
                     if (align1 > align)
@@ -4133,10 +4129,11 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
 
     // choose whether we should perform the initialization with the struct as an IR value
     // or instead initialize the stack buffer with stores (the latter is nearly always better)
+    // although we do the former if it is a vector or could be a vector element
     auto tracked = split_value_size(sty);
     assert(CountTrackedPointers(lt).count == tracked.second);
     bool init_as_value = false;
-    if (lt->isVectorTy() || jl_is_vecelement_type(ty)) { // maybe also check the size ?
+    if (lt->isVectorTy() || jl_special_vector_alignment(1, ty) != 0) {
         init_as_value = true;
     }
 
@@ -4343,7 +4340,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
         if (strct) {
             jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack);
             promotion_point = ai.decorateInst(ctx.builder.CreateMemSet(strct, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0),
-                                jl_datatype_size(ty), MaybeAlign(jl_datatype_align(ty))));
+                                jl_datatype_size(ty), Align(julia_alignment(ty))));
         }
         ctx.builder.restoreIP(savedIP);
     }
diff --git a/src/datatype.c b/src/datatype.c
index 240b1ace2295f..f2f633adc3623 100644
--- a/src/datatype.c
+++ b/src/datatype.c
@@ -300,9 +300,10 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz,
 }
 
 // Determine if homogeneous tuple with fields of type t will have
-// a special alignment beyond normal Julia rules.
+// a special alignment and vector ABI beyond normal rules for aggregates.
 // Return special alignment if one exists, 0 if normal alignment rules hold.
 // A non-zero result *must* match the LLVM rules for a vector type.
+// This matches the compiler's `__attribute__((vector_size))` behavior.
 // For sake of Ahead-Of-Time (AOT) compilation, this routine has to work
 // without LLVM being available.
 unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t)
@@ -317,8 +318,12 @@ unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t)
         // motivating use case comes up for Julia, we reject pointers.
         return 0;
     size_t elsz = jl_datatype_size(ty);
-    if (elsz != 1 && elsz != 2 && elsz != 4 && elsz != 8)
-        // Only handle power-of-two-sized elements (for now)
+    if (next_power_of_two(elsz) != elsz)
+        // Only handle power-of-two-sized elements (for now), since other
+        // sizes may be packed into very complicated arrangements: on most
+        // platforms LLVM pads the alignment of such types, but not the type
+        // size, and adds no extra bytes per element, so their effect on
+        // field offsets is never what you might naturally expect.
         return 0;
     size_t size = nfields * elsz;
     // Use natural alignment for this vector: this matches LLVM and clang.
@@ -723,9 +728,9 @@ void jl_compute_field_offsets(jl_datatype_t *st)
             }
             else {
                 fsz = sizeof(void*);
-                if (fsz > MAX_ALIGN)
-                    fsz = MAX_ALIGN;
                 al = fsz;
+                if (al > MAX_ALIGN)
+                    al = MAX_ALIGN;
                 desc[i].isptr = 1;
                 zeroinit = 1;
                 npointers++;
@@ -769,8 +774,6 @@ void jl_compute_field_offsets(jl_datatype_t *st)
             if (al > alignm)
                 alignm = al;
         }
-        if (alignm > MAX_ALIGN)
-            alignm = MAX_ALIGN; // We cannot guarantee alignments over 16 bytes because that's what our heap is aligned as
         if (LLT_ALIGN(sz, alignm) > sz) {
             haspadding = 1;
             sz = LLT_ALIGN(sz, alignm);
@@ -939,6 +942,14 @@ JL_DLLEXPORT jl_datatype_t *jl_new_primitivetype(jl_value_t *name, jl_module_t *
     uint32_t nbytes = (nbits + 7) / 8;
     uint32_t alignm = next_power_of_two(nbytes);
 #   if defined(_CPU_X86_) && !defined(_OS_WINDOWS_)
+    // datalayout strings are often weird: on 64-bit platforms they usually
+    // follow fairly simple rules, but on 32-bit x86 platforms, 5- to 8-byte
+    // types are sometimes only 32-bit aligned even though MAX_ALIGN (for
+    // types of 9+ bytes) is 16 (except for f80, which is align 4 on MinGW,
+    // Linux, and the BSDs, but align 16 on MSVC and Darwin):
+    // https://llvm.org/doxygen/ARMTargetMachine_8cpp.html#adb29b487708f0dc2a940345b68649270
+    // https://llvm.org/doxygen/AArch64TargetMachine_8cpp.html#a003a58caf135efbf7273c5ed84e700d7
+    // https://llvm.org/doxygen/X86TargetMachine_8cpp.html#aefdbcd6131ef195da070cef7fdaf0532
     if (alignm == 8)
         alignm = 4;
 #   endif
diff --git a/test/core.jl b/test/core.jl
index fd607cc86f1d5..b27a648f543d1 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -5773,6 +5773,13 @@ let ni128 = sizeof(FP128test) ÷ sizeof(Int),
     @test reinterpret(UInt128, arr[2].fp) == expected
 end
 
+# make sure VecElement tuples have the C alignment and ABI for supported types
+primitive type Int24 24 end
+@test Base.datatype_alignment(NTuple{10,VecElement{Int16}}) == 32
+@test Base.datatype_alignment(NTuple{10,VecElement{Int24}}) == 4
+@test Base.datatype_alignment(NTuple{10,VecElement{Int64}}) == 128
+@test Base.datatype_alignment(NTuple{10,VecElement{Int128}}) == 256
+
 # issue #21516
 struct T21516
     x::Vector{Float64}
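
A minimal sketch (not part of the patch) of the layout rule the commit message
describes; `Wrapper` is a hypothetical type, and the expected values assume this
patch is applied on a platform where `NTuple{4,VecElement{Float64}}` receives the
32-byte vector alignment:

```julia
using Test

# A nested object's offset is computed from the field type's own alignment,
# as in C layouts, even when that alignment exceeds the 16-byte MAX_ALIGN
# that the heap allocator guarantees.
struct Wrapper
    tag::Int8
    vec::NTuple{4,VecElement{Float64}}  # 32 bytes => 32-byte vector alignment
end

@test Base.datatype_alignment(NTuple{4,VecElement{Float64}}) == 32
@test fieldoffset(Wrapper, 2) == 32  # padded to the field's alignment, not clamped to 16
@test Base.datatype_alignment(Wrapper) == 32
@test sizeof(Wrapper) == 64          # 1 tag byte + 31 padding bytes + 32 vector bytes
```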