From a348cec8f5c79e89a542fcb60aa7bd6f4d8b54ca Mon Sep 17 00:00:00 2001
From: Yingbo Ma <mayingbo5@gmail.com>
Date: Thu, 18 Nov 2021 22:45:57 -0500
Subject: [PATCH 1/4] Failed attempts

---
 Project.toml       |   1 +
 src/ForwardDiff.jl |   1 +
 src/partials.jl    | 167 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 158 insertions(+), 11 deletions(-)

diff --git a/Project.toml b/Project.toml
index 3318b46d..93b0d8a0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 Calculus = "0.2, 0.3, 0.4, 0.5"
diff --git a/src/ForwardDiff.jl b/src/ForwardDiff.jl
index 93d3b246..6affc113 100644
--- a/src/ForwardDiff.jl
+++ b/src/ForwardDiff.jl
@@ -8,6 +8,7 @@ if VERSION >= v"1.6"
 end
 using Random
 using LinearAlgebra
+using VectorizationBase
 
 import Printf
 import NaNMath
diff --git a/src/partials.jl b/src/partials.jl
index fce67b0a..b60ffa68 100644
--- a/src/partials.jl
+++ b/src/partials.jl
@@ -197,30 +197,175 @@ end
     return tupexpr(i -> :(rand(V)), N)
 end
 
-@generated function scale_tuple(tup::NTuple{N}, x) where N
-    return tupexpr(i -> :(tup[$i] * x), N)
+const SIMDFloat = Union{Float64, Float32}
+const SIMDInt = Union{
+                       Int128, Int64, Int32, Int16, Int8,
+                       UInt128, UInt64, UInt32, UInt16, UInt8,
+                       # `Bool` is left out: `+`/`-` on `Bool` widen to `Int`, LLVM `i8` ops do not.
+                     }
+const SIMDType = Union{SIMDFloat, SIMDInt}
+
+function julia_type_to_llvm_type(@nospecialize(T::DataType))
+    T === Float64 ? "double" :
+    T === Float32 ? "float"  :
+    T <: Union{Int128,UInt128} ? "i128" :
+    T <: Union{Int64,UInt64} ? "i64" :
+    T <: Union{Int32,UInt32} ? "i32" :
+    T <: Union{Int16,UInt16} ? "i16" :
+    T <: Union{Bool,Int8,UInt8} ? "i8" :
+    error("$T cannot be mapped to a LLVM type")
 end
 
-@generated function div_tuple_by_scalar(tup::NTuple{N}, x) where N
-    return tupexpr(i -> :(tup[$i] / x), N)
+function llvmir_scalar_to_vec(@nospecialize(T::DataType), n::Int, vname::String)
+    S = julia_type_to_llvm_type(T)
+    el = string("ele", vname)
+    """
+      %$el = insertelement <$n x $S> undef, $S %0, i32 0
+      %$vname = shufflevector <$n x $S> %$el, <$n x $S> undef, <$n x i32> zeroinitializer
+    """
 end
 
-@generated function add_tuples(a::NTuple{N}, b::NTuple{N})  where N
-    return tupexpr(i -> :(a[$i] + b[$i]), N)
+@generated function scale_tuple(tup::NTuple{N,T1}, x::S1) where {N,T1,S1}
+    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(tup[$i] * x), N)
+
+    T = promote_type(T1, S1)
+    S = julia_type_to_llvm_type(T)
+    VT = NTuple{N, VecElement{T}}
+    op = T <: SIMDFloat ? "fmul nsz contract" : "mul"
+    llvmir = """
+    %el = insertelement <$N x $S> undef, $S %1, i32 0
+    %vx = shufflevector <$N x $S> %el, <$N x $S> undef, <$N x i32> zeroinitializer
+    %res = $op <$N x $S> %0, %vx
+    ret <$N x $S> %res
+    """
+
+    quote
+        $(Expr(:meta, :inline))
+        t = Base.@ntuple $N i->$T(tup[i])
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $T}, $VT(t), $T(x))
+        Base.@ntuple $N i->ret[i].value
+    end
+end
+
+@generated function div_tuple_by_scalar(tup::NTuple{N,T1}, x::S1) where {N,T1,S1}
+    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(tup[$i] / x), N)
+
+    T = typeof(one(T1) / one(S1))
+    S = julia_type_to_llvm_type(T)
+    VT = NTuple{N, VecElement{T}}
+    op = T <: SIMDFloat ? "fdiv nsz contract" : "div"
+    llvmir = """
+    %el = insertelement <$N x $S> undef, $S %1, i32 0
+    %vx = shufflevector <$N x $S> %el, <$N x $S> undef, <$N x i32> zeroinitializer
+    %res = $op <$N x $S> %0, %vx
+    ret <$N x $S> %res
+    """
+
+    quote
+        $(Expr(:meta, :inline))
+        t = Base.@ntuple $N i->$T(tup[i])
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $T}, $VT(t), $T(x))
+        Base.@ntuple $N i->ret[i].value
+    end
+end
+
+@generated function add_tuples(a::NTuple{N,T1}, b::NTuple{N,S1}) where {N,T1,S1}
+    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(a[$i] + b[$i]), N)
+
+    T = promote_type(T1, S1)
+    S = julia_type_to_llvm_type(T)
+    VT = NTuple{N, VecElement{T}}
+    op = T <: SIMDFloat ? "fadd nsz contract" : "add"
+    llvmir = """
+    %res = $op <$N x $S> %0, %1
+    ret <$N x $S> %res
+    """
+
+    quote
+        $(Expr(:meta, :inline))
+        at = Base.@ntuple $N i->$T(a[i])
+        bt = Base.@ntuple $N i->$T(b[i])
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $VT}, $VT(at), $VT(bt))
+        Base.@ntuple $N i->ret[i].value
+    end
 end
 
-@generated function sub_tuples(a::NTuple{N}, b::NTuple{N})  where N
-    return tupexpr(i -> :(a[$i] - b[$i]), N)
+@generated function sub_tuples(a::NTuple{N,T1}, b::NTuple{N,S1}) where {N,T1,S1}
+    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(a[$i] - b[$i]), N)
+
+    T = promote_type(T1, S1)
+    S = julia_type_to_llvm_type(T)
+    VT = NTuple{N, VecElement{T}}
+    op = T <: SIMDFloat ? "fsub nsz contract" : "sub"
+    llvmir = """
+    %res = $op <$N x $S> %0, %1
+    ret <$N x $S> %res
+    """
+
+    quote
+        $(Expr(:meta, :inline))
+        at = Base.@ntuple $N i->$T(a[i])
+        bt = Base.@ntuple $N i->$T(b[i])
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $VT}, $VT(at), $VT(bt))
+        Base.@ntuple $N i->ret[i].value
+    end
 end
 
-@generated function minus_tuple(tup::NTuple{N}) where N
-    return tupexpr(i -> :(-tup[$i]), N)
+@generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
+    T <: SIMDType || return tupexpr(i -> :(-tup[$i]), N)
+
+    S = julia_type_to_llvm_type(T)
+    VT = NTuple{N, VecElement{T}}
+    op = T <: SIMDFloat ? "fneg nsz contract" : "sub"
+    llvmir = """
+    %res = $op <$N x $S> %0
+    ret <$N x $S> %res
+    """
+
+    quote
+        $(Expr(:meta, :inline))
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT}, $VT(tup))
+        Base.@ntuple $N i->ret[i].value
+    end
 end
 
-@generated function mul_tuples(a::NTuple{N}, b::NTuple{N}, afactor, bfactor) where N
+@generated function mul_tuples(a::NTuple{N,V1}, b::NTuple{N,V2}, afactor::S1, bfactor::S2) where {N,V1,V2,S1,S2}
     return tupexpr(i -> :((afactor * a[$i]) + (bfactor * b[$i])), N)
 end
 
+#=
+@inline function scale_tuple(tup::NTuple{N,T}, x) where {N,T<:SIMDType}
+    Tuple(Vec{N,T}(tup...) * x)
+end
+
+@inline function div_tuple_by_scalar(tup::NTuple{N,T}, x) where {N,T<:SIMDType}
+    Tuple(Vec{N,T}(tup...) / x)
+end
+
+@inline function add_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T<:SIMDType,S<:SIMDType}
+    va = Vec{N,T}(a...)
+    vb = Vec{N,S}(b...)
+    return Tuple(va + vb)
+end
+
+@inline function sub_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T<:SIMDType,S<:SIMDType}
+    va = Vec{N,T}(a...)
+    vb = Vec{N,S}(b...)
+    return Tuple(va - vb)
+end
+
+@inline function minus_tuple(a::NTuple{N,T}) where {N,T<:SIMDType}
+    va = Vec{N,T}(a...)
+    return Tuple(-va)
+end
+
+@inline function mul_tuples(a::NTuple{N,T}, b::NTuple{N,S}, afactor::SIMDType, bfactor::SIMDType) where {N,T<:SIMDType,S<:SIMDType}
+    va = Vec{N,T}(a...)
+    vb = Vec{N,S}(b...)
+    return Tuple(muladd(afactor, va, bfactor * vb))
+end
+=#
+
 ###################
 # Pretty Printing #
 ###################

From 0a5df539b65cabb7fd330995b32c7cb12970e20b Mon Sep 17 00:00:00 2001
From: Yingbo Ma <mayingbo5@gmail.com>
Date: Thu, 18 Nov 2021 23:30:03 -0500
Subject: [PATCH 2/4] Slightly better SIMD criterion

---
 Project.toml         |   1 -
 src/ForwardDiff.jl   |   1 -
 src/partials.jl      | 125 ++++++++++++++++++++-----------------------
 test/PartialsTest.jl |   7 ++-
 4 files changed, 63 insertions(+), 71 deletions(-)

diff --git a/Project.toml b/Project.toml
index 93b0d8a0..3318b46d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,7 +14,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 Calculus = "0.2, 0.3, 0.4, 0.5"
diff --git a/src/ForwardDiff.jl b/src/ForwardDiff.jl
index 6affc113..93d3b246 100644
--- a/src/ForwardDiff.jl
+++ b/src/ForwardDiff.jl
@@ -8,7 +8,6 @@ if VERSION >= v"1.6"
 end
 using Random
 using LinearAlgebra
-using VectorizationBase
 
 import Printf
 import NaNMath
diff --git a/src/partials.jl b/src/partials.jl
index b60ffa68..95d1dc4c 100644
--- a/src/partials.jl
+++ b/src/partials.jl
@@ -216,19 +216,11 @@ function julia_type_to_llvm_type(@nospecialize(T::DataType))
     error("$T cannot be mapped to a LLVM type")
 end
 
-function llvmir_scalar_to_vec(@nospecialize(T::DataType), n::Int, vname::String)
-    S = julia_type_to_llvm_type(T)
-    el = string("ele", vname)
-    """
-      %$el = insertelement <$n x $S> undef, $S %0, i32 0
-      %$vname = shufflevector <$n x $S> %$el, <$n x $S> undef, <$n x i32> zeroinitializer
-    """
-end
-
-@generated function scale_tuple(tup::NTuple{N,T1}, x::S1) where {N,T1,S1}
-    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(tup[$i] * x), N)
+@generated function scale_tuple(tup::NTuple{N,T}, x::S) where {N,T,S}
+    if !(T === S && S <: SIMDType)
+        return tupexpr(i -> :(tup[$i] * x), N)
+    end
 
-    T = promote_type(T1, S1)
     S = julia_type_to_llvm_type(T)
     VT = NTuple{N, VecElement{T}}
     op = T <: SIMDFloat ? "fmul nsz contract" : "mul"
@@ -241,16 +233,16 @@ end
 
     quote
         $(Expr(:meta, :inline))
-        t = Base.@ntuple $N i->$T(tup[i])
-        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $T}, $VT(t), $T(x))
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $T}, $VT(tup), x)
         Base.@ntuple $N i->ret[i].value
     end
 end
 
-@generated function div_tuple_by_scalar(tup::NTuple{N,T1}, x::S1) where {N,T1,S1}
-    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(tup[$i] / x), N)
+@generated function div_tuple_by_scalar(tup::NTuple{N,T}, x::S) where {N,T,S}
+    if !(T === S === typeof(one(T) / one(S)) && S <: SIMDType)
+        return tupexpr(i -> :(tup[$i] / x), N)
+    end
 
-    T = typeof(one(T1) / one(S1))
     S = julia_type_to_llvm_type(T)
     VT = NTuple{N, VecElement{T}}
     op = T <: SIMDFloat ? "fdiv nsz contract" : "div"
@@ -263,16 +255,16 @@ end
 
     quote
         $(Expr(:meta, :inline))
-        t = Base.@ntuple $N i->$T(tup[i])
-        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $T}, $VT(t), $T(x))
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $T}, $VT(tup), x)
         Base.@ntuple $N i->ret[i].value
     end
 end
 
-@generated function add_tuples(a::NTuple{N,T1}, b::NTuple{N,S1}) where {N,T1,S1}
-    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(a[$i] + b[$i]), N)
+@generated function add_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T,S}
+    if !(T === S && S <: SIMDType)
+        return tupexpr(i -> :(a[$i] + b[$i]), N)
+    end
 
-    T = promote_type(T1, S1)
     S = julia_type_to_llvm_type(T)
     VT = NTuple{N, VecElement{T}}
     op = T <: SIMDFloat ? "fadd nsz contract" : "add"
@@ -283,17 +275,16 @@ end
 
     quote
         $(Expr(:meta, :inline))
-        at = Base.@ntuple $N i->$T(a[i])
-        bt = Base.@ntuple $N i->$T(b[i])
-        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $VT}, $VT(at), $VT(bt))
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $VT}, $VT(a), $VT(b))
         Base.@ntuple $N i->ret[i].value
     end
 end
 
-@generated function sub_tuples(a::NTuple{N,T1}, b::NTuple{N,S1}) where {N,T1,S1}
-    (T1 <: SIMDType && S1 <: SIMDType) || return tupexpr(i -> :(a[$i] - b[$i]), N)
+@generated function sub_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T,S}
+    if !(T === S && S <: SIMDType)
+        return tupexpr(i -> :(a[$i] - b[$i]), N)
+    end
 
-    T = promote_type(T1, S1)
     S = julia_type_to_llvm_type(T)
     VT = NTuple{N, VecElement{T}}
     op = T <: SIMDFloat ? "fsub nsz contract" : "sub"
@@ -304,9 +295,7 @@ end
 
     quote
         $(Expr(:meta, :inline))
-        at = Base.@ntuple $N i->$T(a[i])
-        bt = Base.@ntuple $N i->$T(b[i])
-        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $VT}, $VT(at), $VT(bt))
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT, $VT}, $VT(a), $VT(b))
         Base.@ntuple $N i->ret[i].value
     end
 end
@@ -316,11 +305,17 @@ end
 
     S = julia_type_to_llvm_type(T)
     VT = NTuple{N, VecElement{T}}
-    op = T <: SIMDFloat ? "fneg nsz contract" : "sub"
-    llvmir = """
-    %res = $op <$N x $S> %0
-    ret <$N x $S> %res
-    """
+    if T <: SIMDFloat
+        llvmir = """
+        %res = fneg nsz contract <$N x $S> %0
+        ret <$N x $S> %res
+        """
+    else
+        llvmir = """
+        %res = sub <$N x $S> zeroinitializer, %0
+        ret <$N x $S> %res
+        """
+    end
 
     quote
         $(Expr(:meta, :inline))
@@ -330,41 +325,35 @@ end
 end
 
 @generated function mul_tuples(a::NTuple{N,V1}, b::NTuple{N,V2}, afactor::S1, bfactor::S2) where {N,V1,V2,S1,S2}
-    return tupexpr(i -> :((afactor * a[$i]) + (bfactor * b[$i])), N)
-end
-
-#=
-@inline function scale_tuple(tup::NTuple{N,T}, x) where {N,T<:SIMDType}
-    Tuple(Vec{N,T}(tup...) * x)
-end
-
-@inline function div_tuple_by_scalar(tup::NTuple{N,T}, x) where {N,T<:SIMDType}
-    Tuple(Vec{N,T}(tup...) / x)
-end
-
-@inline function add_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T<:SIMDType,S<:SIMDType}
-    va = Vec{N,T}(a...)
-    vb = Vec{N,S}(b...)
-    return Tuple(va + vb)
-end
-
-@inline function sub_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T<:SIMDType,S<:SIMDType}
-    va = Vec{N,T}(a...)
-    vb = Vec{N,S}(b...)
-    return Tuple(va - vb)
-end
+    if !(V1 === V2 === S1 === S2 && S2 <: SIMDFloat)
+        return tupexpr(i -> :((afactor * a[$i]) + (bfactor * b[$i])), N)
+    end
 
-@inline function minus_tuple(a::NTuple{N,T}) where {N,T<:SIMDType}
-    va = Vec{N,T}(a...)
-    return Tuple(-va)
-end
+    T = V1
+    S = julia_type_to_llvm_type(T)
+    fmuladd = "@llvm.fmuladd.v$(N)f$(sizeof(T)*8)"
 
-@inline function mul_tuples(a::NTuple{N,T}, b::NTuple{N,S}, afactor::SIMDType, bfactor::SIMDType) where {N,T<:SIMDType,S<:SIMDType}
-    va = Vec{N,T}(a...)
-    vb = Vec{N,S}(b...)
-    return Tuple(muladd(afactor, va, bfactor * vb))
+    VT = NTuple{N, VecElement{T}}
+    llvmir = """
+    declare <$N x $S> $fmuladd(<$N x $S>, <$N x $S>, <$N x $S>)
+
+    define <$N x $S> @entry(<$N x $S>, <$N x $S>, $S, $S) alwaysinline {
+    top:
+        %el1 = insertelement <$N x $S> undef, $S %2, i32 0
+        %afactor = shufflevector <$N x $S> %el1, <$N x $S> undef, <$N x i32> zeroinitializer
+        %el2 = insertelement <$N x $S> undef, $S %3, i32 0
+        %bfactor = shufflevector <$N x $S> %el2, <$N x $S> undef, <$N x i32> zeroinitializer
+        %tmp = fmul nsz contract <$N x $S> %1, %bfactor
+        %res = call nsz contract <$N x $S> $fmuladd(<$N x $S> %0, <$N x $S> %afactor, <$N x $S> %tmp)
+        ret <$N x $S> %res
+    }
+    """
+    quote
+        $(Expr(:meta, :inline))
+        ret = Base.llvmcall(($llvmir, "entry"), $VT, Tuple{$VT, $VT, $T, $T}, $VT(a), $VT(b), afactor, bfactor)
+        Base.@ntuple $N i->ret[i].value
+    end
 end
-=#
 
 ###################
 # Pretty Printing #
diff --git a/test/PartialsTest.jl b/test/PartialsTest.jl
index 39fb05d7..84320446 100644
--- a/test/PartialsTest.jl
+++ b/test/PartialsTest.jl
@@ -7,6 +7,10 @@ using ForwardDiff: Partials
 
 samerng() = MersenneTwister(1)
 
+function approx_tuple(x, y)  # elementwise `≈`; lengths must match since `zip` alone truncates
+    return length(x) == length(y) && all(((a, b),) -> a ≈ b, zip(x, y))
+end
+
 for N in (0, 3), T in (Int, Float32, Float64)
     println("  ...testing Partials{$N,$T}")
 
@@ -114,7 +118,8 @@ for N in (0, 3), T in (Int, Float32, Float64)
 
     if N > 0
         @test ForwardDiff._div_partials(PARTIALS, PARTIALS2, X, Y) == ForwardDiff._mul_partials(PARTIALS, PARTIALS2, inv(Y), -X/(Y^2))
-        @test ForwardDiff._mul_partials(PARTIALS, PARTIALS2, X, Y).values == map((a, b) -> (X * a) + (Y * b), VALUES, VALUES2)
+        # FMA
+        @test approx_tuple(ForwardDiff._mul_partials(PARTIALS, PARTIALS2, X, Y).values, map((a, b) -> (X * a) + (Y * b), VALUES, VALUES2))
         @test ForwardDiff._mul_partials(ZERO_PARTIALS, PARTIALS, X, Y) == Y * PARTIALS
         @test ForwardDiff._mul_partials(PARTIALS, ZERO_PARTIALS, X, Y) == X * PARTIALS
 

From 0068d28a5c9928c33aa03b8570819b967dda2797 Mon Sep 17 00:00:00 2001
From: Yingbo Ma <mayingbo5@gmail.com>
Date: Fri, 19 Nov 2021 00:18:56 -0500
Subject: [PATCH 3/4] Julia version bound for `fneg`

---
 src/partials.jl | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/src/partials.jl b/src/partials.jl
index 95d1dc4c..7eaa7d91 100644
--- a/src/partials.jl
+++ b/src/partials.jl
@@ -300,27 +300,33 @@ end
     end
 end
 
-@generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
-    T <: SIMDType || return tupexpr(i -> :(-tup[$i]), N)
-
-    S = julia_type_to_llvm_type(T)
-    VT = NTuple{N, VecElement{T}}
-    if T <: SIMDFloat
-        llvmir = """
-        %res = fneg nsz contract <$N x $S> %0
-        ret <$N x $S> %res
-        """
-    else
-        llvmir = """
-        %res = sub <$N x $S> zeroinitializer, %0
-        ret <$N x $S> %res
-        """
+if VERSION >= v"1.4" # fsub requires LLVM 8 (Julia 1.4)
+    @generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
+        T <: SIMDType || return tupexpr(i -> :(-tup[$i]), N)
+
+        S = julia_type_to_llvm_type(T)
+        VT = NTuple{N, VecElement{T}}
+        if T <: SIMDFloat
+            llvmir = """
+            %res = fneg nsz contract <$N x $S> %0
+            ret <$N x $S> %res
+            """
+        else
+            llvmir = """
+            %res = sub <$N x $S> zeroinitializer, %0
+            ret <$N x $S> %res
+            """
+        end
+
+        quote
+            $(Expr(:meta, :inline))
+            ret = Base.llvmcall($llvmir, $VT, Tuple{$VT}, $VT(tup))
+            Base.@ntuple $N i->ret[i].value
+        end
     end
-
-    quote
-        $(Expr(:meta, :inline))
-        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT}, $VT(tup))
-        Base.@ntuple $N i->ret[i].value
+else
+    @generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
+        return tupexpr(i -> :(-tup[$i]), N)
     end
 end
 

From e50d653d1d800b634979da566f08f113166deb1b Mon Sep 17 00:00:00 2001
From: Yingbo Ma <mayingbo5@gmail.com>
Date: Fri, 19 Nov 2021 00:27:29 -0500
Subject: [PATCH 4/4] Julia 1.0 doesn't get nice things

---
 src/partials.jl | 59 +++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/src/partials.jl b/src/partials.jl
index 7eaa7d91..0325ea75 100644
--- a/src/partials.jl
+++ b/src/partials.jl
@@ -205,6 +205,9 @@ const SIMDInt = Union{
                      }
 const SIMDType = Union{SIMDFloat, SIMDInt}
 
+# This may not be a sharp bound, but at least people won't get worse results.
+const HAS_FLEXIABLE_VECTOR_LENGTH = VERSION >= v"1.6"
+
 function julia_type_to_llvm_type(@nospecialize(T::DataType))
     T === Float64 ? "double" :
     T === Float32 ? "float"  :
@@ -217,7 +220,7 @@ function julia_type_to_llvm_type(@nospecialize(T::DataType))
 end
 
 @generated function scale_tuple(tup::NTuple{N,T}, x::S) where {N,T,S}
-    if !(T === S && S <: SIMDType)
+    if !(HAS_FLEXIABLE_VECTOR_LENGTH && T === S && S <: SIMDType)
         return tupexpr(i -> :(tup[$i] * x), N)
     end
 
@@ -239,7 +242,7 @@ end
 end
 
 @generated function div_tuple_by_scalar(tup::NTuple{N,T}, x::S) where {N,T,S}
-    if !(T === S === typeof(one(T) / one(S)) && S <: SIMDType)
+    if !(HAS_FLEXIABLE_VECTOR_LENGTH && T === S === typeof(one(T) / one(S)) && S <: SIMDType)
         return tupexpr(i -> :(tup[$i] / x), N)
     end
 
@@ -261,7 +264,7 @@ end
 end
 
 @generated function add_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T,S}
-    if !(T === S && S <: SIMDType)
+    if !(HAS_FLEXIABLE_VECTOR_LENGTH && T === S && S <: SIMDType)
         return tupexpr(i -> :(a[$i] + b[$i]), N)
     end
 
@@ -281,7 +284,7 @@ end
 end
 
 @generated function sub_tuples(a::NTuple{N,T}, b::NTuple{N,S}) where {N,T,S}
-    if !(T === S && S <: SIMDType)
+    if !(HAS_FLEXIABLE_VECTOR_LENGTH && T === S && S <: SIMDType)
         return tupexpr(i -> :(a[$i] - b[$i]), N)
     end
 
@@ -300,38 +303,32 @@ end
     end
 end
 
-if VERSION >= v"1.4" # fsub requires LLVM 8 (Julia 1.4)
-    @generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
-        T <: SIMDType || return tupexpr(i -> :(-tup[$i]), N)
-
-        S = julia_type_to_llvm_type(T)
-        VT = NTuple{N, VecElement{T}}
-        if T <: SIMDFloat
-            llvmir = """
-            %res = fneg nsz contract <$N x $S> %0
-            ret <$N x $S> %res
-            """
-        else
-            llvmir = """
-            %res = sub <$N x $S> zeroinitializer, %0
-            ret <$N x $S> %res
-            """
-        end
-
-        quote
-            $(Expr(:meta, :inline))
-            ret = Base.llvmcall($llvmir, $VT, Tuple{$VT}, $VT(tup))
-            Base.@ntuple $N i->ret[i].value
-        end
+@generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
+    (HAS_FLEXIABLE_VECTOR_LENGTH && T <: SIMDType) || return tupexpr(i -> :(-tup[$i]), N)
+    # SIMD path: build LLVM IR that negates all N lanes with one vector instruction.
+    S = julia_type_to_llvm_type(T)
+    VT = NTuple{N, VecElement{T}}
+    if T <: SIMDFloat
+        llvmir = """
+        %res = fneg nsz contract <$N x $S> %0
+        ret <$N x $S> %res
+        """
+    else
+        llvmir = """
+        %res = sub <$N x $S> zeroinitializer, %0
+        ret <$N x $S> %res
+        """
     end
-else
-    @generated function minus_tuple(tup::NTuple{N,T}) where {N,T}
-        return tupexpr(i -> :(-tup[$i]), N)
+    # Pass the tuple to llvmcall as VecElements, then unwrap each lane's `.value`.
+    quote
+        $(Expr(:meta, :inline))
+        ret = Base.llvmcall($llvmir, $VT, Tuple{$VT}, $VT(tup))
+        Base.@ntuple $N i->ret[i].value
     end
 end
 
 @generated function mul_tuples(a::NTuple{N,V1}, b::NTuple{N,V2}, afactor::S1, bfactor::S2) where {N,V1,V2,S1,S2}
-    if !(V1 === V2 === S1 === S2 && S2 <: SIMDFloat)
+    if !(HAS_FLEXIABLE_VECTOR_LENGTH && V1 === V2 === S1 === S2 && S2 <: SIMDFloat)
         return tupexpr(i -> :((afactor * a[$i]) + (bfactor * b[$i])), N)
     end