From e7b2a8abe0702c68761913c275c366f27a80fec7 Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Thu, 20 Feb 2025 22:20:52 +0530
Subject: [PATCH 1/6] Specialize Diagonal * Adjoint (#1207)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reinstates slightly altered versions of the methods that were
removed in https://github.com/JuliaLang/julia/pull/52389. Sort of fixes
#1205, although this doesn't recover the full performance. However, this
version is more general, and works with the example presented in
https://github.com/JuliaLang/julia/pull/52389. There's still a
performance regression, but the full performance may only be obtained
for mutable matrices, and we may not assume mutability in general.

Performance:
v1.10:

```julia
julia> n = 100
100

julia> A = adjoint(sparse(Float64, I, n, n));

julia> B = Diagonal(ones(n));

julia> @btime $A * $B;
  837.119 ns (5 allocations: 2.59 KiB)
```

This PR
```julia
julia> @btime $A * $B;
  1.106 μs (15 allocations: 5.56 KiB)
```
We need double the allocations here compared to earlier, as we firstly
materialize `D' * A'`, and then we again copy the adjoint of this
result. I wonder if this may be reduced.

---------

Co-authored-by: Daniel Karrasch <daniel.karrasch@posteo.de>
---
 src/adjtrans.jl |  4 ++--
 src/diagonal.jl | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/adjtrans.jl b/src/adjtrans.jl
index f7caa9bc..d81aa3ae 100644
--- a/src/adjtrans.jl
+++ b/src/adjtrans.jl
@@ -319,8 +319,8 @@ const AdjointAbsVec{T} = Adjoint{T,<:AbstractVector}
 const AdjointAbsMat{T} = Adjoint{T,<:AbstractMatrix}
 const TransposeAbsVec{T} = Transpose{T,<:AbstractVector}
 const TransposeAbsMat{T} = Transpose{T,<:AbstractMatrix}
-const AdjOrTransAbsVec{T} = AdjOrTrans{T,<:AbstractVector}
-const AdjOrTransAbsMat{T} = AdjOrTrans{T,<:AbstractMatrix}
+const AdjOrTransAbsVec{T,V<:AbstractVector} = AdjOrTrans{T,V}
+const AdjOrTransAbsMat{T,M<:AbstractMatrix} = AdjOrTrans{T,M}
 
 # for internal use below
 wrapperop(_) = identity
diff --git a/src/diagonal.jl b/src/diagonal.jl
index 9f8d54e5..9334f2c6 100644
--- a/src/diagonal.jl
+++ b/src/diagonal.jl
@@ -332,6 +332,28 @@ function (*)(D::Diagonal, V::AbstractVector)
     return D.diag .* V
 end
 
+function _diag_adj_mul(A::AdjOrTransAbsMat, D::Diagonal)
+    adj = wrapperop(A)
+    copy(adj(adj(D) * adj(A)))
+end
+function _diag_adj_mul(A::AdjOrTransAbsMat{<:Number, <:StridedMatrix}, D::Diagonal{<:Number})
+    @invoke *(A::AbstractMatrix, D::AbstractMatrix)
+end
+function _diag_adj_mul(D::Diagonal, A::AdjOrTransAbsMat)
+    adj = wrapperop(A)
+    copy(adj(adj(A) * adj(D)))
+end
+function _diag_adj_mul(D::Diagonal{<:Number}, A::AdjOrTransAbsMat{<:Number, <:StridedMatrix})
+    @invoke *(D::AbstractMatrix, A::AbstractMatrix)
+end
+
+function (*)(A::AdjOrTransAbsMat, D::Diagonal)
+    _diag_adj_mul(A, D)
+end
+function (*)(D::Diagonal, A::AdjOrTransAbsMat)
+    _diag_adj_mul(D, A)
+end
+
 function rmul!(A::AbstractMatrix, D::Diagonal)
     matmul_size_check(size(A), size(D))
     for I in CartesianIndices(A)

From 0745131bb035f018bdb0239a74175e8a8cd883dd Mon Sep 17 00:00:00 2001
From: Daniel Karrasch <daniel.karrasch@posteo.de>
Date: Wed, 5 Feb 2025 12:06:07 +0100
Subject: [PATCH 2/6] Use `BLAS.trsm!` instead of `LAPACK.trtrs!` in
 left-triangular solves (#1194)

Co-authored-by: Alexis Montoison <alexis.montoison@polymtl.ca>
---
 src/blas.jl        | 13 ++++++++++++-
 src/triangular.jl  |  8 +++++---
 test/testtriag.jl  |  2 ++
 test/triangular.jl |  7 ++++++-
 4 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/blas.jl b/src/blas.jl
index 3c156300..04e590c1 100644
--- a/src/blas.jl
+++ b/src/blas.jl
@@ -84,7 +84,8 @@ export
     trsm!,
     trsm
 
-using ..LinearAlgebra: libblastrampoline, BlasReal, BlasComplex, BlasFloat, BlasInt, DimensionMismatch, checksquare, chkstride1
+using ..LinearAlgebra: libblastrampoline, BlasReal, BlasComplex, BlasFloat, BlasInt,
+    DimensionMismatch, checksquare, chkstride1, SingularException
 
 include("lbt.jl")
 
@@ -1369,6 +1370,11 @@ for (fname, elty) in ((:dtrsv_,:Float64),
                 throw(DimensionMismatch(lazy"size of A is $n != length(x) = $(length(x))"))
             end
             chkstride1(A)
+            if diag == 'N'
+                for i in 1:n
+                    iszero(A[i,i]) && throw(SingularException(i))
+                end
+            end
             px, stx = vec_pointer_stride(x, ArgumentError("input vector with 0 stride is not allowed"))
             GC.@preserve x ccall((@blasfunc($fname), libblastrampoline), Cvoid,
                 (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasInt},
@@ -2217,6 +2223,11 @@ for (mmname, smname, elty) in
             end
             chkstride1(A)
             chkstride1(B)
+            if diag == 'N'
+                for i in 1:k
+                    iszero(A[i,i]) && throw(SingularException(i))
+                end
+            end
             ccall((@blasfunc($smname), libblastrampoline), Cvoid,
                    (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8},
                     Ref{BlasInt}, Ref{BlasInt}, Ref{$elty}, Ptr{$elty},
diff --git a/src/triangular.jl b/src/triangular.jl
index 5c6b5188..3faa12da 100644
--- a/src/triangular.jl
+++ b/src/triangular.jl
@@ -1223,11 +1223,13 @@ function generic_mattrimul!(C::StridedMatrix{T}, uploc, isunitc, tfun::Function,
     end
 end
 # division
-function generic_trimatdiv!(C::StridedVecOrMat{T}, uploc, isunitc, tfun::Function, A::StridedMatrix{T}, B::AbstractVecOrMat{T}) where {T<:BlasFloat}
+generic_trimatdiv!(C::StridedVector{T}, uploc, isunitc, tfun::Function, A::StridedMatrix{T}, B::AbstractVector{T}) where {T<:BlasFloat} =
+    BLAS.trsv!(uploc, tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', isunitc, A, C === B ? C : copyto!(C, B))
+function generic_trimatdiv!(C::StridedMatrix{T}, uploc, isunitc, tfun::Function, A::StridedMatrix{T}, B::AbstractMatrix{T}) where {T<:BlasFloat}
     if stride(C,1) == stride(A,1) == 1
-        LAPACK.trtrs!(uploc, tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', isunitc, A, C === B ? C : copyto!(C, B))
+        BLAS.trsm!('L', uploc, tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', isunitc, one(T), A, C === B ? C : copyto!(C, B))
     else # incompatible with LAPACK
-        @invoke generic_trimatdiv!(C::AbstractVecOrMat, uploc, isunitc, tfun::Function, A::AbstractMatrix, B::AbstractVecOrMat)
+        @invoke generic_trimatdiv!(C::AbstractVecOrMat, uploc, isunitc, tfun::Function, A::AbstractMatrix, B::AbstractMatrix)
     end
 end
 function generic_mattridiv!(C::StridedMatrix{T}, uploc, isunitc, tfun::Function, A::AbstractMatrix{T}, B::StridedMatrix{T}) where {T<:BlasFloat}
diff --git a/test/testtriag.jl b/test/testtriag.jl
index 7542d843..6ed7a0d6 100644
--- a/test/testtriag.jl
+++ b/test/testtriag.jl
@@ -493,6 +493,8 @@ function test_triangular(elty1_types)
                 @test_throws DimensionMismatch transpose(Ann) \ bm
                 if t1 == UpperTriangular || t1 == LowerTriangular
                     @test_throws SingularException ldiv!(t1(zeros(elty1, n, n)), fill(eltyB(1), n))
+                    @test_throws SingularException ldiv!(t1(zeros(elty1, n, n)), fill(eltyB(1), n, 2))
+                    @test_throws SingularException rdiv!(fill(eltyB(1), n, n), t1(zeros(elty1, n, n)))
                 end
                 @test B / A1 ≈ B / M1
                 @test B / transpose(A1) ≈ B / transpose(M1)
diff --git a/test/triangular.jl b/test/triangular.jl
index a5fa45d0..8eeeb666 100644
--- a/test/triangular.jl
+++ b/test/triangular.jl
@@ -886,8 +886,13 @@ end
     end
 end
 
-@testset "(l/r)mul! and (l/r)div! for non-contiguous matrices" begin
+@testset "(l/r)mul! and (l/r)div! for non-contiguous arrays" begin
     U = UpperTriangular(reshape(collect(3:27.0),5,5))
+    b = float.(1:10)
+    b2 = copy(b); b2v = view(b2, 1:2:9); b2vc = copy(b2v)
+    @test lmul!(U, b2v) == lmul!(U, b2vc)
+    b2 = copy(b); b2v = view(b2, 1:2:9); b2vc = copy(b2v)
+    @test ldiv!(U, b2v) ≈ ldiv!(U, b2vc)
     B = float.(collect(reshape(1:100, 10,10)))
     B2 = copy(B); B2v = view(B2, 1:2:9, 1:5); B2vc = copy(B2v)
     @test lmul!(U, B2v) == lmul!(U, B2vc)

From 9b6fe735ca7f32124fa0d7bfad01d103e1ca0517 Mon Sep 17 00:00:00 2001
From: Keno Fischer <keno@juliacomputing.com>
Date: Mon, 10 Feb 2025 00:08:29 -0500
Subject: [PATCH 3/6] Explicitly declare type constructor imports (#1196)

These would otherwise give a warning after
https://github.com/JuliaLang/julia/pull/57311, but are a good change
even independently.

---------

Co-authored-by: Daniel Karrasch <daniel.karrasch@posteo.de>
(cherry picked from commit 2a1696a93683e947b46991ea454c4ffb1988e794)
---
 src/LinearAlgebra.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/LinearAlgebra.jl b/src/LinearAlgebra.jl
index fc1081e0..2c76dcdc 100644
--- a/src/LinearAlgebra.jl
+++ b/src/LinearAlgebra.jl
@@ -16,6 +16,7 @@ import Base: USE_BLAS64, abs, acos, acosh, acot, acoth, acsc, acsch, adjoint, as
     permutedims, permuterows!, power_by_squaring, promote_rule, real, sec, sech, setindex!,
     show, similar, sin, sincos, sinh, size, sqrt, strides, stride, tan, tanh, transpose, trunc,
     typed_hcat, vec, view, zero
+import Base: AbstractArray, AbstractMatrix, Array, Matrix
 using Base: IndexLinear, promote_eltype, promote_op, print_matrix,
     @propagate_inbounds, reduce, typed_hvcat, typed_vcat, require_one_based_indexing,
     splat, BitInteger

From 87542a7f88f7ef749d2e1d2460dc33da0d0a8f08 Mon Sep 17 00:00:00 2001
From: Jakob Nybo Nissen <jakobnybonissen@gmail.com>
Date: Fri, 14 Feb 2025 03:01:23 +0100
Subject: [PATCH 4/6] Add fast path in generic matmul (#1202)

This manually adds the critical optimisation investigated in
https://github.com/JuliaLang/julia/issues/56954. While we could rely on
LLVM to continue doing this optimisation, it's more robust to add it
manually.

(cherry picked from commit ed35a377cd3945c91654cabc0bac9cbed2d67a0e)
---
 src/matmul.jl  | 1 +
 test/matmul.jl | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/matmul.jl b/src/matmul.jl
index 76719c53..21c6e299 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -1021,6 +1021,7 @@ function _generic_matmatmul_nonadjtrans!(C, A, B, alpha, beta)
     @inbounds for n in axes(B, 2), k in axes(B, 1)
         # Balpha = B[k,n] * alpha, but we skip the multiplication in case isone(alpha)
         Balpha = @stable_muladdmul MulAddMul(alpha, false)(B[k,n])
+        !ismissing(Balpha) && iszero(Balpha) && continue
         @simd for m in axes(A, 1)
             C[m,n] = muladd(A[m,k], Balpha, C[m,n])
         end
diff --git a/test/matmul.jl b/test/matmul.jl
index 805edeac..afa730f1 100644
--- a/test/matmul.jl
+++ b/test/matmul.jl
@@ -767,6 +767,7 @@ import LinearAlgebra: Adjoint, Transpose
 (*)(x::RootInt, y::Integer) = x.i * y
 adjoint(x::RootInt) = x
 transpose(x::RootInt) = x
+Base.zero(::RootInt) = RootInt(0)
 
 @test Base.promote_op(*, RootInt, RootInt) === Int
 

From 37aa43ad700967a6bd8f9fc324705c28f8b695bc Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Mon, 17 Feb 2025 07:01:08 +0530
Subject: [PATCH 5/6] Restrict Diagonal sqrt branch to positive diag (#1203)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As noted in https://github.com/JuliaLang/LinearAlgebra.jl/issues/1193,
the `sqrt(::Digonal{<:Real})` method requires the diagonal element to be
positive, so that `sqrt` is defined for the individual elements. We
therefore may restrict the diagonal branch in the dense `sqrt` method to
matrices with positive `diag` if the `eltype` is `Real`.

Fixes #1193

After this,
```julia
julia> A = diagm(0 => [1.0, -1.0])
2×2 Matrix{Float64}:
 1.0   0.0
 0.0  -1.0

julia> sqrt(A)
2×2 Matrix{ComplexF64}:
 1.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+1.0im
```

(cherry picked from commit 16d9d61d67419939f5528870eff4aacb3dd3afe5)
---
 src/dense.jl  | 3 ++-
 test/dense.jl | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/dense.jl b/src/dense.jl
index ccf6bc80..234391e5 100644
--- a/src/dense.jl
+++ b/src/dense.jl
@@ -972,7 +972,8 @@ sqrt(::AbstractMatrix)
 function sqrt(A::AbstractMatrix{T}) where {T<:Union{Real,Complex}}
     if checksquare(A) == 0
         return copy(float(A))
-    elseif isdiag(A)
+    elseif isdiag(A) && (T <: Complex || all(x -> x ≥ zero(x), diagview(A)))
+        # Real Diagonal sqrt requires each diagonal element to be positive
         return applydiagonal(sqrt, A)
     elseif ishermitian(A)
         sqrtHermA = sqrt(Hermitian(A))
diff --git a/test/dense.jl b/test/dense.jl
index 1f953ea9..1d4b330c 100644
--- a/test/dense.jl
+++ b/test/dense.jl
@@ -984,6 +984,12 @@ end
 @testset "sqrt for diagonal" begin
     A = diagm(0 => [1, 2, 3])
     @test sqrt(A)^2 ≈ A
+
+    A = diagm(0 => [1.0, -1.0])
+    @test sqrt(A) == diagm(0 => [1.0, 1.0im])
+    @test sqrt(A)^2 ≈ A
+    B = im*A
+    @test sqrt(B)^2 ≈ B
 end
 
 @testset "issue #40141" begin

From 6b5bdf2cdb2cae8ce00cd9498c0155481a46be7e Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Thu, 20 Feb 2025 16:50:21 +0530
Subject: [PATCH 6/6] Indirection in matrix multiplication to avoid ambiguities
 (#1210)

(cherry picked from commit 5cf41c438b9014e2b9607b60a20140b4bb0a97da)
---
 src/diagonal.jl | 30 ++++++++++++++++--------------
 src/matmul.jl   |  4 +++-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/diagonal.jl b/src/diagonal.jl
index 9334f2c6..8ded64ad 100644
--- a/src/diagonal.jl
+++ b/src/diagonal.jl
@@ -693,22 +693,24 @@ end
 for Tri in (:UpperTriangular, :LowerTriangular)
     UTri = Symbol(:Unit, Tri)
     # 2 args
-    for (fun, f) in zip((:*, :rmul!, :rdiv!, :/), (:identity, :identity, :inv, :inv))
-        @eval $fun(A::$Tri, D::Diagonal) = $Tri($fun(A.data, D))
-        @eval $fun(A::$UTri, D::Diagonal) = $Tri(_setdiag!($fun(A.data, D), $f, D.diag))
+    for (fun, f) in zip((:mul, :rmul!, :rdiv!, :/), (:identity, :identity, :inv, :inv))
+        g = fun == :mul ? :* : fun
+        @eval $fun(A::$Tri, D::Diagonal) = $Tri($g(A.data, D))
+        @eval $fun(A::$UTri, D::Diagonal) = $Tri(_setdiag!($g(A.data, D), $f, D.diag))
     end
-    @eval *(A::$Tri{<:Any, <:StridedMaybeAdjOrTransMat}, D::Diagonal) =
-            @invoke *(A::AbstractMatrix, D::Diagonal)
-    @eval *(A::$UTri{<:Any, <:StridedMaybeAdjOrTransMat}, D::Diagonal) =
-            @invoke *(A::AbstractMatrix, D::Diagonal)
-    for (fun, f) in zip((:*, :lmul!, :ldiv!, :\), (:identity, :identity, :inv, :inv))
-        @eval $fun(D::Diagonal, A::$Tri) = $Tri($fun(D, A.data))
-        @eval $fun(D::Diagonal, A::$UTri) = $Tri(_setdiag!($fun(D, A.data), $f, D.diag))
+    @eval mul(A::$Tri{<:Any, <:StridedMaybeAdjOrTransMat}, D::Diagonal) =
+            @invoke mul(A::AbstractMatrix, D::Diagonal)
+    @eval mul(A::$UTri{<:Any, <:StridedMaybeAdjOrTransMat}, D::Diagonal) =
+            @invoke mul(A::AbstractMatrix, D::Diagonal)
+    for (fun, f) in zip((:mul, :lmul!, :ldiv!, :\), (:identity, :identity, :inv, :inv))
+        g = fun == :mul ? :* : fun
+        @eval $fun(D::Diagonal, A::$Tri) = $Tri($g(D, A.data))
+        @eval $fun(D::Diagonal, A::$UTri) = $Tri(_setdiag!($g(D, A.data), $f, D.diag))
     end
-    @eval *(D::Diagonal, A::$Tri{<:Any, <:StridedMaybeAdjOrTransMat}) =
-            @invoke *(D::Diagonal, A::AbstractMatrix)
-    @eval *(D::Diagonal, A::$UTri{<:Any, <:StridedMaybeAdjOrTransMat}) =
-            @invoke *(D::Diagonal, A::AbstractMatrix)
+    @eval mul(D::Diagonal, A::$Tri{<:Any, <:StridedMaybeAdjOrTransMat}) =
+            @invoke mul(D::Diagonal, A::AbstractMatrix)
+    @eval mul(D::Diagonal, A::$UTri{<:Any, <:StridedMaybeAdjOrTransMat}) =
+            @invoke mul(D::Diagonal, A::AbstractMatrix)
     # 3-arg ldiv!
     @eval ldiv!(C::$Tri, D::Diagonal, A::$Tri) = $Tri(ldiv!(C.data, D, A.data))
     @eval ldiv!(C::$Tri, D::Diagonal, A::$UTri) = $Tri(_setdiag!(ldiv!(C.data, D, A.data), inv, D.diag))
diff --git a/src/matmul.jl b/src/matmul.jl
index 21c6e299..84a83d3a 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -111,7 +111,9 @@ julia> [1 1; 0 1] * [1 0; 1 1]
  1  1
 ```
 """
-function (*)(A::AbstractMatrix, B::AbstractMatrix)
+(*)(A::AbstractMatrix, B::AbstractMatrix) = mul(A, B)
+# we add an extra level of indirection to avoid ambiguities in *
+function mul(A::AbstractMatrix, B::AbstractMatrix)
     TS = promote_op(matprod, eltype(A), eltype(B))
     mul!(matprod_dest(A, B, TS), A, B)
 end