diff --git a/Project.toml b/Project.toml index 0e35b4c..918aadd 100644 --- a/Project.toml +++ b/Project.toml @@ -6,24 +6,25 @@ version = "1.3.3" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" +DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" +SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" SymbolicAnalysis = "4297ee4d-0239-47d8-ba5d-195ecdf594fe" SymbolicIndexingInterface = "2efcf032-c050-4f8e-a9bb-153293bab1f5" Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" -SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" -SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" [weakdeps] Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/ext/OptimizationEnzymeExt.jl b/ext/OptimizationEnzymeExt.jl index 4439c8b..08242be 100644 --- a/ext/OptimizationEnzymeExt.jl +++ b/ext/OptimizationEnzymeExt.jl @@ -3,13 +3,13 @@ module OptimizationEnzymeExt import OptimizationBase, OptimizationBase.ArrayInterface import OptimizationBase.SciMLBase: OptimizationFunction import OptimizationBase.SciMLBase -import OptimizationBase.LinearAlgebra: I +import OptimizationBase.LinearAlgebra: I, dot import OptimizationBase.ADTypes: AutoEnzyme using Enzyme using Core: Vararg -@inline function firstapply(f::F, θ, p, args...) where {F} - res = f(θ, p, args...) +@inline function firstapply(f::F, θ, p) where {F} + res = f(θ, p) if isa(res, AbstractFloat) res else @@ -17,104 +17,168 @@ using Core: Vararg end end -function inner_grad(θ, bθ, f, p, args::Vararg{Any, N}) where {N} +function inner_grad(θ, bθ, f, p) Enzyme.autodiff_deferred(Enzyme.Reverse, Const(firstapply), Active, Const(f), Enzyme.Duplicated(θ, bθ), - Const(p), - Const.(args)...), + Const(p) + ) return nothing end -function hv_f2_alloc(x, f, p, args...) +function inner_grad_primal(θ, bθ, f, p) + Enzyme.autodiff_deferred(Enzyme.ReverseWithPrimal, + Const(firstapply), + Active, + Const(f), + Enzyme.Duplicated(θ, bθ), + Const(p) + )[2] +end + +function hv_f2_alloc(x, f, p) dx = Enzyme.make_zero(x) Enzyme.autodiff_deferred(Enzyme.Reverse, firstapply, Active, f, Enzyme.Duplicated(x, dx), - Const(p), - Const.(args)...) + Const(p) + ) return dx end function inner_cons(x, fcons::Function, p::Union{SciMLBase.NullParameters, Nothing}, - num_cons::Int, i::Int, args::Vararg{Any, N}) where {N} + num_cons::Int, i::Int) res = zeros(eltype(x), num_cons) - fcons(res, x, p, args...) 
+ fcons(res, x, p) return res[i] end -function cons_f2(x, dx, fcons, p, num_cons, i, args::Vararg{Any, N}) where {N} +function cons_f2(x, dx, fcons, p, num_cons, i) Enzyme.autodiff_deferred(Enzyme.Reverse, inner_cons, Active, Enzyme.Duplicated(x, dx), - Const(fcons), Const(p), Const(num_cons), Const(i), Const.(args)...) + Const(fcons), Const(p), Const(num_cons), Const(i)) return nothing end function inner_cons_oop( x::Vector{T}, fcons::Function, p::Union{SciMLBase.NullParameters, Nothing}, - i::Int, args::Vararg{Any, N}) where {T, N} - return fcons(x, p, args...)[i] + i::Int) where {T} + return fcons(x, p)[i] end -function cons_f2_oop(x, dx, fcons, p, i, args::Vararg{Any, N}) where {N} +function cons_f2_oop(x, dx, fcons, p, i) Enzyme.autodiff_deferred( Enzyme.Reverse, inner_cons_oop, Active, Enzyme.Duplicated(x, dx), - Const(fcons), Const(p), Const(i), Const.(args)...) + Const(fcons), Const(p), Const(i)) + return nothing +end + +function lagrangian(x, _f::Function, cons::Function, p, λ, σ = one(eltype(x)))::Float64 + res = zeros(eltype(x), length(λ)) + cons(res, x, p) + return σ * _f(x, p) + dot(λ, res) +end + +function lag_grad(x, dx, lagrangian::Function, _f::Function, cons::Function, p, σ, λ) + Enzyme.autodiff_deferred(Enzyme.Reverse, lagrangian, Active, Enzyme.Duplicated(x, dx), + Const(_f), Const(cons), Const(p), Const(λ), Const(σ)) return nothing end function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, x, - adtype::AutoEnzyme, p, - num_cons = 0) + adtype::AutoEnzyme, p, num_cons = 0; fg = false, fgh = false) if f.grad === nothing - grad = let - function (res, θ, args...) - Enzyme.make_zero!(res) - Enzyme.autodiff(Enzyme.Reverse, - Const(firstapply), - Active, - Const(f.f), - Enzyme.Duplicated(θ, res), - Const(p), - Const.(args)...) - end + function grad(res, θ) + Enzyme.make_zero!(res) + Enzyme.autodiff(Enzyme.Reverse, + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res), + Const(p) + ) + end + else + grad = (G, θ) -> f.grad(G, θ, p) + end + + if fg == true + function fg!(res, θ) + Enzyme.make_zero!(res) + y = Enzyme.autodiff(Enzyme.ReverseWithPrimal, + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res), + Const(p) + )[2] + return y end else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + fg! = nothing end if f.hess === nothing - function hess(res, θ, args...) - vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) + vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + bθ = zeros(eltype(x), length(x)) + + if f.hess_prototype === nothing + vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + else + #useless right now, looks like there is no way to tell Enzyme the sparsity pattern? + vdbθ = Tuple((copy(r) for r in eachrow(f.hess_prototype))) + end - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + function hess(res, θ) + Enzyme.make_zero!(bθ) + Enzyme.make_zero!.(vdbθ) Enzyme.autodiff(Enzyme.Forward, inner_grad, Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), + Enzyme.BatchDuplicatedNoNeed(bθ, vdbθ), Const(f.f), - Const(p), - Const.(args)...) + Const(p) + ) for i in eachindex(θ) res[i, :] .= vdbθ[i] end end else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) 
+ hess = (H, θ) -> f.hess(H, θ, p) + end + + if fgh == true + function fgh!(G, H, θ) + vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) + vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + + Enzyme.autodiff(Enzyme.Forward, + inner_grad, + Enzyme.BatchDuplicated(θ, vdθ), + Enzyme.BatchDuplicatedNoNeed(G, vdbθ), + Const(f.f), + Const(p) + ) + + for i in eachindex(θ) + H[i, :] .= vdbθ[i] + end + end + else + fgh! = nothing end if f.hv === nothing - hv = function (H, θ, v, args...) + function hv(H, θ, v) H .= Enzyme.autodiff( Enzyme.Forward, hv_f2_alloc, DuplicatedNoNeed, Duplicated(θ, v), - Const(_f), Const(f.f), Const(p), - Const.(args)...)[1] + Const(_f), Const(f.f), Const(p) + )[1] end else hv = f.hv @@ -123,67 +187,162 @@ function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, x, if f.cons === nothing cons = nothing else - cons = (res, θ, args...) -> f.cons(res, θ, p, args...) + cons = (res, θ) -> f.cons(res, θ, p) end if cons !== nothing && f.cons_j === nothing - seeds = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) - Jaccache = Tuple(zeros(eltype(x), num_cons) for i in 1:length(x)) + if num_cons > length(x) + seeds = Enzyme.onehot(x) + Jaccache = Tuple(zeros(eltype(x), num_cons) for i in 1:length(x)) + else + seeds = Enzyme.onehot(zeros(eltype(x), num_cons)) + Jaccache = Tuple(zero(x) for i in 1:num_cons) + end + y = zeros(eltype(x), num_cons) - cons_j = function (J, θ, args...) - for i in 1:num_cons + + function cons_j(J, θ) + for i in eachindex(Jaccache) Enzyme.make_zero!(Jaccache[i]) end Enzyme.make_zero!(y) - Enzyme.autodiff(Enzyme.Forward, f.cons, BatchDuplicated(y, Jaccache), - BatchDuplicated(θ, seeds), Const(p), Const.(args)...) - for i in 1:length(θ) - if J isa Vector - J[i] = Jaccache[i][1] - else - copyto!(@view(J[:, i]), Jaccache[i]) + if num_cons > length(θ) + Enzyme.autodiff(Enzyme.Forward, f.cons, BatchDuplicated(y, Jaccache), + BatchDuplicated(θ, seeds), Const(p)) + for i in eachindex(θ) + if J isa Vector + J[i] = Jaccache[i][1] + else + copyto!(@view(J[:, i]), Jaccache[i]) + end + end + else + Enzyme.autodiff(Enzyme.Reverse, f.cons, BatchDuplicated(y, seeds), + BatchDuplicated(θ, Jaccache), Const(p)) + for i in 1:num_cons + if J isa Vector + J .= Jaccache[1] + else + copyto!(@view(J[i, :]), Jaccache[i]) + end end end end else - cons_j = (J, θ, args...) -> f.cons_j(J, θ, p, args...) + cons_j = (J, θ) -> f.cons_j(J, θ, p) + end + + if cons !== nothing && f.cons_vjp == true + cons_res = zeros(eltype(x), num_cons) + function cons_vjp!(res, θ, v) + Enzyme.make_zero!(res) + Enzyme.make_zero!(cons_res) + + Enzyme.autodiff(Enzyme.Reverse, + f.cons, + Const, + Duplicated(cons_res, v), + Duplicated(θ, res), + Const(p) + ) + end + else + cons_vjp! = (θ, σ) -> f.cons_vjp(θ, σ, p) + end + + if cons !== nothing && f.cons_jvp == true + cons_res = zeros(eltype(x), num_cons) + + function cons_jvp!(res, θ, v) + Enzyme.make_zero!(res) + Enzyme.make_zero!(cons_res) + + Enzyme.autodiff(Enzyme.Forward, + f.cons, + Duplicated(cons_res, res), + Duplicated(θ, v), + Const(p) + ) + end + else + cons_vjp! = nothing end if cons !== nothing && f.cons_h === nothing - cons_h = function (res, θ, args...) 
- vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + cons_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + cons_bθ = zeros(eltype(x), length(x)) + cons_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + + function cons_h(res, θ) for i in 1:num_cons - bθ .= zero(eltype(bθ)) - for el in vdbθ - Enzyme.make_zero!(el) - end + Enzyme.make_zero!(cons_bθ) + Enzyme.make_zero!.(cons_vdbθ) Enzyme.autodiff(Enzyme.Forward, cons_f2, - Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), + Enzyme.BatchDuplicated(θ, cons_vdθ), + Enzyme.BatchDuplicated(cons_bθ, cons_vdbθ), Const(f.cons), Const(p), Const(num_cons), - Const(i), - Const.(args)... - ) + Const(i)) for j in eachindex(θ) - res[i][j, :] .= vdbθ[j] + res[i][j, :] .= cons_vdbθ[j] end end end else - cons_h = (res, θ, args...) -> f.cons_h(res, θ, p, args...) + cons_h = (res, θ) -> f.cons_h(res, θ, p) + end + + if f.lag_h === nothing + lag_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + lag_bθ = zeros(eltype(x), length(x)) + + if f.hess_prototype === nothing + lag_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + else + #useless right now, looks like there is no way to tell Enzyme the sparsity pattern? + lag_vdbθ = Tuple((copy(r) for r in eachrow(f.hess_prototype))) + end + + function lag_h(h, θ, σ, μ) + Enzyme.make_zero!(lag_bθ) + Enzyme.make_zero!.(lag_vdbθ) + + Enzyme.autodiff(Enzyme.Forward, + lag_grad, + Enzyme.BatchDuplicated(θ, lag_vdθ), + Enzyme.BatchDuplicatedNoNeed(lag_bθ, lag_vdbθ), + Const(lagrangian), + Const(f.f), + Const(f.cons), + Const(p), + Const(σ), + Const(μ) + ) + k = 0 + + for i in eachindex(θ) + vec_lagv = lag_vdbθ[i] + h[(k + 1):(k + i)] .= @view(vec_lagv[1:i]) + k += i + end + end + else + lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) end return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, cons = cons, cons_j = cons_j, cons_h = cons_h, hess_prototype = f.hess_prototype, cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype) + cons_hess_prototype = f.cons_hess_prototype, + lag_h = lag_h, + lag_hess_prototype = f.lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) end function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, @@ -191,50 +350,105 @@ function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, adtype::AutoEnzyme, num_cons = 0) p = cache.p + x = cache.u0 + + return OptimizationBase.instantiate_function(f, x, adtype, p, num_cons) +end +function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, x, + adtype::AutoEnzyme, p, num_cons = 0; fg = false, fgh = false) if f.grad === nothing - function grad(res, θ, args...) + res = zeros(eltype(x), size(x)) + function grad(θ) Enzyme.make_zero!(res) Enzyme.autodiff(Enzyme.Reverse, Const(firstapply), Active, Const(f.f), Enzyme.Duplicated(θ, res), - Const(p), - Const.(args)...) + Const(p) + ) + return res + end + else + grad = (θ) -> f.grad(θ, p) + end + + if fg == true + res_fg = zeros(eltype(x), size(x)) + function fg!(θ) + Enzyme.make_zero!(res_fg) + y = Enzyme.autodiff(Enzyme.ReverseWithPrimal, + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res_fg), + Const(p) + )[2] + return y, res end else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + fg! 
= nothing end if f.hess === nothing - function hess(res, θ, args...) - vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + bθ = zeros(eltype(x), length(x)) + vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + + function hess(θ) + Enzyme.make_zero!(bθ) + Enzyme.make_zero!.(vdbθ) Enzyme.autodiff(Enzyme.Forward, inner_grad, Enzyme.BatchDuplicated(θ, vdθ), Enzyme.BatchDuplicated(bθ, vdbθ), Const(f.f), - Const(p), - Const.(args)...) + Const(p) + ) + + return reduce( + vcat, [reshape(vdbθ[i], (1, length(vdbθ[i]))) for i in eachindex(θ)]) + end + else + hess = (θ) -> f.hess(θ, p) + end + + if fgh == true + vdθ_fgh = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + vdbθ_fgh = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + G_fgh = zeros(eltype(x), length(x)) + H_fgh = zeros(eltype(x), length(x), length(x)) + + function fgh!(θ) + Enzyme.make_zero!(G_fgh) + Enzyme.make_zero!(H_fgh) + Enzyme.make_zero!.(vdbθ_fgh) + + Enzyme.autodiff(Enzyme.Forward, + inner_grad, + Enzyme.BatchDuplicated(θ, vdθ_fgh), + Enzyme.BatchDuplicatedNoNeed(G_fgh, vdbθ_fgh), + Const(f.f), + Const(p) + ) for i in eachindex(θ) - res[i, :] .= vdbθ[i] + H_fgh[i, :] .= vdbθ_fgh[i] end + return G_fgh, H_fgh end else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + fgh! = nothing end if f.hv === nothing - hv = function (H, θ, v, args...) - H .= Enzyme.autodiff( + function hv(θ, v) + return Enzyme.autodiff( Enzyme.Forward, hv_f2_alloc, DuplicatedNoNeed, Duplicated(θ, v), - Const(f.f), Const(p), - Const.(args)...)[1] + Const(_f), Const(f.f), Const(p) + )[1] end else hv = f.hv @@ -243,296 +457,154 @@ function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, if f.cons === nothing cons = nothing else - cons = (res, θ, args...) -> f.cons(res, θ, p, args...) + function cons(θ) + return f.cons(θ, p) + end end if cons !== nothing && f.cons_j === nothing - seeds = Tuple((Array(r) - for r in eachrow(I(length(cache.u0)) * one(eltype(cache.u0))))) - Jaccache = Tuple(zeros(eltype(cache.u0), num_cons) for i in 1:length(cache.u0)) - y = zeros(eltype(cache.u0), num_cons) - cons_j = function (J, θ, args...) - for i in 1:num_cons + seeds = Enzyme.onehot(x) + Jaccache = Tuple(zeros(eltype(x), num_cons) for i in 1:length(x)) + + function cons_j(θ) + for i in eachindex(Jaccache) Enzyme.make_zero!(Jaccache[i]) end - Enzyme.make_zero!(y) - Enzyme.autodiff(Enzyme.Forward, f.cons, BatchDuplicated(y, Jaccache), - BatchDuplicated(θ, seeds), Const(p), Const.(args)...) - for i in 1:length(θ) - if J isa Vector - J[i] = Jaccache[i][1] - else - copyto!(@view(J[:, i]), Jaccache[i]) - end + y, Jaccache = Enzyme.autodiff(Enzyme.Forward, f.cons, Duplicated, + BatchDuplicated(θ, seeds), Const(p)) + if size(y, 1) == 1 + return reduce(vcat, Jaccache) + else + return reduce(hcat, Jaccache) end end else - cons_j = (J, θ, args...) -> f.cons_j(J, θ, p, args...) + cons_j = (θ) -> f.cons_j(θ, p) end - if cons !== nothing && f.cons_h === nothing - cons_h = function (res, θ, args...) 
- vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) - for i in 1:num_cons - bθ .= zero(eltype(bθ)) - for el in vdbθ - Enzyme.make_zero!(el) - end - Enzyme.autodiff(Enzyme.Forward, - cons_f2, - Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), - Const(f.cons), - Const(p), - Const(num_cons), - Const(i), - Const.(args)... - ) - - for j in eachindex(θ) - res[i][j, :] .= vdbθ[j] - end - end - end - else - cons_h = (res, θ, args...) -> f.cons_h(res, θ, p, args...) - end + if cons !== nothing && f.cons_vjp == true + res_vjp = zeros(eltype(x), size(x)) + cons_vjp_res = zeros(eltype(x), num_cons) - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = f.hess_prototype, - cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype) -end + function cons_vjp(θ, v) + Enzyme.make_zero!(res_vjp) + Enzyme.make_zero!(cons_vjp_res) -function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, x, - adtype::AutoEnzyme, p, - num_cons = 0) - if f.grad === nothing - res = zeros(eltype(x), size(x)) - grad = let res = res - function (θ, args...) - Enzyme.make_zero!(res) - Enzyme.autodiff(Enzyme.Reverse, - Const(firstapply), - Active, - Const(f.f), - Enzyme.Duplicated(θ, res), - Const(p), - Const.(args)...) - return res - end + Enzyme.autodiff(Enzyme.Reverse, + f.cons, + Const, + Duplicated(cons_vjp_res, v), + Duplicated(θ, res_vjp), + Const(p) + ) + return res_vjp end else - grad = (θ, args...) -> f.grad(θ, p, args...) + cons_vjp = (θ, σ) -> f.cons_vjp(θ, σ, p) end - if f.hess === nothing - function hess(θ, args...) - vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) + if cons !== nothing && f.cons_jvp == true + res_jvp = zeros(eltype(x), size(x)) + cons_jvp_res = zeros(eltype(x), num_cons) - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + function cons_jvp(θ, v) + Enzyme.make_zero!(res_jvp) + Enzyme.make_zero!(cons_jvp_res) Enzyme.autodiff(Enzyme.Forward, - inner_grad, - Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), - Const(f.f), - Const(p), - Const.(args)...) - - return reduce( - vcat, [reshape(vdbθ[i], (1, length(vdbθ[i]))) for i in eachindex(θ)]) + f.cons, + Duplicated(cons_jvp_res, res_jvp), + Duplicated(θ, v), + Const(p) + ) + return res_jvp end else - hess = (θ, args...) -> f.hess(θ, p, args...) - end - - if f.hv === nothing - hv = function (θ, v, args...) - Enzyme.autodiff( - Enzyme.Forward, hv_f2_alloc, DuplicatedNoNeed, Duplicated(θ, v), - Const(_f), Const(f.f), Const(p), - Const.(args)...)[1] - end - else - hv = f.hv + cons_jvp = nothing end - if f.cons === nothing - cons = nothing - else - cons_oop = (θ, args...) -> f.cons(θ, p, args...) - end - - if f.cons !== nothing && f.cons_j === nothing - seeds = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) - cons_j = function (θ, args...) - J = Enzyme.autodiff( - Enzyme.Forward, f.cons, BatchDuplicated(θ, seeds), Const(p), Const.(args)...)[1] - if num_cons == 1 - return reduce(vcat, J) - else - return reduce(hcat, J) - end - end - else - cons_j = (θ) -> f.cons_j(θ, p) - end - - if f.cons !== nothing && f.cons_h === nothing - cons_h = function (θ, args...) 
- vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) - res = [zeros(eltype(x), length(θ), length(θ)) for i in 1:num_cons] - for i in 1:num_cons - Enzyme.make_zero!(bθ) - for el in vdbθ - Enzyme.make_zero!(el) - end + if cons !== nothing && f.cons_h === nothing + cons_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + cons_bθ = zeros(eltype(x), length(x)) + cons_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + + function cons_h(θ) + return map(1:num_cons) do i + Enzyme.make_zero!(cons_bθ) + Enzyme.make_zero!.(cons_vdbθ) Enzyme.autodiff(Enzyme.Forward, cons_f2_oop, - Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), + Enzyme.BatchDuplicated(θ, cons_vdθ), + Enzyme.BatchDuplicated(cons_bθ, cons_vdbθ), Const(f.cons), Const(p), - Const(i), - Const.(args)... - ) - for j in eachindex(θ) - res[i][j, :] = vdbθ[j] - end - end - return res - end - else - cons_h = (θ, args...) -> f.cons_h(θ, p, args...) - end + Const(i)) - return OptimizationFunction{false}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons_oop, cons_j = cons_j, cons_h = cons_h, - hess_prototype = f.hess_prototype, - cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype) -end - -function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, - cache::OptimizationBase.ReInitCache, - adtype::AutoEnzyme, - num_cons = 0) - p = cache.p - - if f.grad === nothing - res = zeros(eltype(x), size(x)) - grad = let res = res - function (θ, args...) - Enzyme.make_zero!(res) - Enzyme.autodiff(Enzyme.Reverse, - Const(firstapply), - Active, - Const(f.f), - Enzyme.Duplicated(θ, res), - Const(p), - Const.(args)...) - return res + return reduce(hcat, cons_vdbθ) end end else - grad = (θ, args...) -> f.grad(θ, p, args...) + cons_h = (θ) -> f.cons_h(θ, p) end - if f.hess === nothing - function hess(θ, args...) - vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) + if f.lag_h === nothing + lag_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + lag_bθ = zeros(eltype(x), length(x)) + if f.hess_prototype === nothing + lag_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + else + lag_vdbθ = Tuple((copy(r) for r in eachrow(f.hess_prototype))) + end - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + function lag_h(θ, σ, μ) + Enzyme.make_zero!(lag_bθ) + Enzyme.make_zero!.(lag_vdbθ) Enzyme.autodiff(Enzyme.Forward, - inner_grad, - Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), + lag_grad, + Enzyme.BatchDuplicated(θ, lag_vdθ), + Enzyme.BatchDuplicatedNoNeed(lag_bθ, lag_vdbθ), + Const(lagrangian), Const(f.f), + Const(f.cons), Const(p), - Const.(args)...) - - return reduce( - vcat, [reshape(vdbθ[i], (1, length(vdbθ[i]))) for i in eachindex(θ)]) - end - else - hess = (θ, args...) -> f.hess(θ, p, args...) - end - - if f.hv === nothing - hv = function (θ, v, args...) - Enzyme.autodiff( - Enzyme.Forward, hv_f2_alloc, DuplicatedNoNeed, Duplicated(θ, v), - Const(_f), Const(f.f), Const(p), - Const.(args)...)[1] - end - else - hv = f.hv - end + Const(σ), + Const(μ) + ) - if f.cons === nothing - cons = nothing - else - cons_oop = (θ, args...) -> f.cons(θ, p, args...) 
- end + k = 0 - if f.cons !== nothing && f.cons_j === nothing - J = Tuple(zeros(eltype(cache.u0), length(cache.u0)) for i in 1:num_cons) - cons_j = function (θ, args...) - for i in 1:num_cons - Enzyme.make_zero!(J[i]) - end - Enzyme.autodiff( - Enzyme.Forward, f.cons, BatchDuplicated(θ, J), Const(p), Const.(args)...) - return reduce(vcat, reshape.(J, Ref(1), Ref(length(θ)))) - end - else - cons_j = (θ, args...) -> f.cons_j(θ, p, args...) - end - - if f.cons !== nothing && f.cons_h === nothing - cons_h = function (θ, args...) - vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) - bθ = zeros(eltype(θ), length(θ)) - vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) - res = [zeros(eltype(x), length(θ), length(θ)) for i in 1:num_cons] - for i in 1:num_cons - Enzyme.make_zero!(bθ) - for el in vdbθ - Enzyme.make_zero!(el) - end - Enzyme.autodiff(Enzyme.Forward, - cons_f2_oop, - Enzyme.BatchDuplicated(θ, vdθ), - Enzyme.BatchDuplicated(bθ, vdbθ), - Const(f.cons), - Const(p), - Const(i), - Const.(args)... - ) - for j in eachindex(θ) - res[i][j, :] = vdbθ[j] - end + for i in eachindex(θ) + vec_lagv = lag_vdbθ[i] + res[(k + 1):(k + i), :] .= @view(vec_lagv[1:i]) + k += i end return res end else - cons_h = (θ) -> f.cons_h(θ, p) + lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) end return OptimizationFunction{false}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons_oop, cons_j = cons_j, cons_h = cons_h, + cons = cons, cons_j = cons_j, cons_h = cons_h, hess_prototype = f.hess_prototype, cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype) + cons_hess_prototype = f.cons_hess_prototype, + lag_h = lag_h, + lag_hess_prototype = f.lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, + cache::OptimizationBase.ReInitCache, + adtype::AutoEnzyme, + num_cons = 0) + p = cache.p + x = cache.u0 + + return OptimizationBase.instantiate_function(f, x, adtype, p, num_cons) end end diff --git a/ext/OptimizationZygoteExt.jl b/ext/OptimizationZygoteExt.jl index 211db2d..ab43ea5 100644 --- a/ext/OptimizationZygoteExt.jl +++ b/ext/OptimizationZygoteExt.jl @@ -1,43 +1,66 @@ module OptimizationZygoteExt -import OptimizationBase +using OptimizationBase +using OptimizationBase.FastClosures +import OptimizationBase.ArrayInterface import OptimizationBase.SciMLBase: OptimizationFunction -import OptimizationBase.ADTypes: AutoZygote -using Zygote, Zygote.ForwardDiff +import OptimizationBase.LinearAlgebra: I +import DifferentiationInterface +import DifferentiationInterface: prepare_gradient, prepare_hessian, prepare_hvp, + prepare_jacobian, + gradient!, hessian!, hvp!, jacobian!, gradient, hessian, + hvp, jacobian +using ADTypes, SciMLBase +import Zygote + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, x, adtype::ADTypes.AutoZygote, + p = SciMLBase.NullParameters(), num_cons = 0; + fg = false, fgh = false, cons_vjp = false, cons_jvp = false) + function _f(θ) + return f(θ, p)[1] + end + + adtype, soadtype = OptimizationBase.generate_adtype(adtype) -function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, x, - adtype::AutoZygote, p, - num_cons = 0) - _f = (θ, args...) -> f(θ, p, args...)[1] if f.grad === nothing - grad = function (res, θ, args...) 
- val = Zygote.gradient(x -> _f(x, args...), θ)[1] - if val === nothing - res .= zero(eltype(θ)) - else - res .= val - end + extras_grad = prepare_gradient(_f, adtype, x) + function grad(res, θ) + gradient!(_f, res, adtype, θ, extras_grad) end else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + grad = (G, θ) -> f.grad(G, θ, p) end + if fg == true + function fg!(res, θ) + (y, _) = value_and_gradient!(_f, res, adtype, θ, extras_grad) + return y + end + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec if f.hess === nothing - hess = function (res, θ, args...) - res .= ForwardDiff.jacobian(θ) do θ - Zygote.gradient(x -> _f(x, args...), θ)[1] - end + extras_hess = prepare_hessian(_f, soadtype, x) + function hess(res, θ) + hessian!(_f, res, soadtype, θ, extras_hess) end else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + hess = (H, θ) -> f.hess(H, θ, p) + end + + if fgh == true + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!(_f, G, H, θ, extras_hess) + return y + end end if f.hv === nothing - hv = function (H, θ, v, args...) - _θ = ForwardDiff.Dual.(θ, v) - res = similar(_θ) - grad(res, _θ, args...) - H .= getindex.(ForwardDiff.partials.(res), 1) + extras_hvp = prepare_hvp(_f, soadtype, x, zeros(eltype(x), size(x))) + hv = function (H, θ, v) + hvp!(_f, H, soadtype, θ, v, extras_hvp) end else hv = f.hv @@ -46,161 +69,175 @@ function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, x, if f.cons === nothing cons = nothing else - cons = (res, θ) -> f.cons(res, θ, p) - cons_oop = (x) -> (_res = Zygote.Buffer(x, num_cons); cons(_res, x); copy(_res)) + function cons(res, θ) + return f.cons(res, θ, p) + end + + function cons_oop(x) + _res = Zygote.Buffer(x, num_cons) + cons(_res, x) + return copy(_res) + end end + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec if cons !== nothing && f.cons_j === nothing - cons_j = function (J, θ) - J .= first(Zygote.jacobian(cons_oop, θ)) + extras_jac = prepare_jacobian(cons_oop, adtype, x) + function cons_j(J, θ) + jacobian!(cons_oop, J, adtype, θ, extras_jac) + if size(J, 1) == 1 + J = vec(J) + end end else cons_j = (J, θ) -> f.cons_j(J, θ, p) end - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - cons_h = function (res, θ) - for i in 1:num_cons - res[i] .= Zygote.hessian(fncs[i], θ) - end + if f.cons_vjp === nothing && cons_vjp == true + extras_pullback = prepare_pullback(cons_oop, adtype, x) + function cons_vjp!(J, θ, v) + pullback!(cons_oop, J, adtype, θ, v, extras_pullback) end else - cons_h = (res, θ) -> f.cons_h(res, θ, p) + cons_vjp! = nothing end - if f.lag_h === nothing - lag_h = nothing # Consider implementing this + if f.cons_jvp === nothing && cons_jvp == true + extras_pushforward = prepare_pushforward(cons_oop, adtype, x) + function cons_jvp!(J, θ, v) + pushforward!(cons_oop, J, adtype, θ, v, extras_pushforward) + end else - lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) + cons_jvp! 
= nothing end - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = f.hess_prototype, - cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype, - lag_h = lag_h, - lag_hess_prototype = f.lag_hess_prototype, - sys = f.sys, - expr = f.expr, - cons_expr = f.cons_expr) -end + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if cons !== nothing && f.cons_h === nothing + fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] + extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) -function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, - cache::OptimizationBase.ReInitCache, - adtype::AutoZygote, num_cons = 0) - _f = (θ, args...) -> f(θ, cache.p, args...)[1] - if f.grad === nothing - grad = function (res, θ, args...) - val = Zygote.gradient(x -> _f(x, args...), θ)[1] - if val === nothing - res .= zero(eltype(θ)) - else - res .= val + function cons_h(H, θ) + for i in 1:num_cons + hessian!(fncs[i], H[i], soadtype, θ, extras_cons_hess[i]) end end else - grad = (G, θ, args...) -> f.grad(G, θ, cache.p, args...) + cons_h = (res, θ) -> f.cons_h(res, θ, p) end - if f.hess === nothing - hess = function (res, θ, args...) - res .= ForwardDiff.jacobian(θ) do θ - Zygote.gradient(x -> _f(x, args...), θ)[1] - end - end - else - hess = (H, θ, args...) -> f.hess(H, θ, cache.p, args...) + function lagrangian(x, σ = one(eltype(x)), λ = ones(eltype(x), num_cons)) + return σ * _f(x) + dot(λ, cons_oop(x)) end - if f.hv === nothing - hv = function (H, θ, v, args...) - _θ = ForwardDiff.Dual.(θ, v) - res = similar(_θ) - grad(res, _θ, args...) - H .= getindex.(ForwardDiff.partials.(res), 1) - end - else - hv = f.hv - end + lag_hess_prototype = f.lag_hess_prototype - if f.cons === nothing - cons = nothing - else - cons = (res, θ) -> f.cons(res, θ, cache.p) - cons_oop = (x) -> (_res = Zygote.Buffer(x, num_cons); cons(_res, x); copy(_res)) - end + if f.lag_h === nothing + lag_extras = prepare_hessian(lagrangian, soadtype, x) + lag_hess_prototype = zeros(Bool, length(x), length(x)) - if cons !== nothing && f.cons_j === nothing - cons_j = function (J, θ) - J .= first(Zygote.jacobian(cons_oop, θ)) + function lag_h(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h(H, θ) + H *= λ + else + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) + end end - else - cons_j = (J, θ) -> f.cons_j(J, θ, cache.p) - end - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - cons_h = function (res, θ) - for i in 1:num_cons - res[i] .= Zygote.hessian(fncs[i], θ) + function lag_h(h, θ, σ, λ) + H = eltype(θ).(lag_hess_prototype) + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end end end else - cons_h = (res, θ) -> f.cons_h(res, θ, cache.p) - end - - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, cache.p) + lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) end return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = f.hess_prototype, - cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype, - lag_h = lag_h, - lag_hess_prototype = f.lag_hess_prototype, + cons_vjp 
= cons_vjp!, cons_jvp = cons_jvp!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h, + lag_hess_prototype = lag_hess_prototype, sys = f.sys, expr = f.expr, cons_expr = f.cons_expr) end -function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, x, - adtype::AutoZygote, p, - num_cons = 0) - _f = (θ, args...) -> f(θ, p, args...)[1] +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoZygote, num_cons = 0) + x = cache.u0 + p = cache.p + + return OptimizationBase.instantiate_function(f, x, adtype, p, num_cons) +end + +function instantiate_function( + f::OptimizationFunction{true}, x, adtype::ADTypes.AutoSparse{<:AutoZygote}, + p = SciMLBase.NullParameters(), num_cons = 0; + fg = false, fgh = false, + cons_vjp = false, cons_jvp = false) + function _f(θ) + return f(θ, p)[1] + end + + adtype, soadtype = generate_sparse_adtype(adtype) + if f.grad === nothing - grad = function (θ, args...) - val = Zygote.gradient(x -> _f(x, args...), θ)[1] - if val === nothing - return zero(eltype(θ)) - else - return val - end + extras_grad = prepare_gradient(_f, adtype.dense_ad, x) + function grad(res, θ) + gradient!(_f, res, adtype.dense_ad, θ, extras_grad) end else - grad = (θ, args...) -> f.grad(θ, p, args...) + grad = (G, θ) -> f.grad(G, θ, p) end + if fg == true && f.fg !== nothing + function fg!(res, θ) + (y, _) = value_and_gradient!(_f, res, adtype.dense_ad, θ, extras_grad) + return y + end + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec if f.hess === nothing - hess = function (θ, args...) - return ForwardDiff.jacobian(θ) do θ - return Zygote.gradient(x -> _f(x, args...), θ)[1] - end + extras_hess = prepare_hessian(_f, soadtype, x) #placeholder logic, can be made much better + function hess(res, θ) + hessian!(_f, res, soadtype, θ, extras_hess) end + hess_sparsity = extras_hess.sparsity + hess_colors = extras_hess.colors else - hess = (θ, args...) -> f.hess(θ, p, args...) + hess = (H, θ) -> f.hess(H, θ, p) + end + + if fgh == true && f.fgh !== nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!(_f, G, H, θ, extras_hess) + return y + end end if f.hv === nothing - hv = function (H, θ, v, args...) - _θ = ForwardDiff.Dual.(θ, v) - res = grad(_θ, args...) 
- return getindex.(ForwardDiff.partials.(res), 1) + extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, zeros(eltype(x), size(x))) + hv = function (H, θ, v) + hvp!(_f, H, soadtype.dense_ad, θ, v, extras_hvp) end else hv = f.hv @@ -209,136 +246,125 @@ function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, x if f.cons === nothing cons = nothing else - cons = (θ) -> f.cons(θ, p) - cons_oop = cons - end - - if cons !== nothing && f.cons_j === nothing - cons_j = function (θ) - if num_cons > 1 - return first(Zygote.jacobian(cons_oop, θ)) - else - return first(Zygote.jacobian(cons_oop, θ))[1, :] - end + function cons(res, θ) + f.cons(res, θ, p) end - else - cons_j = (θ) -> f.cons_j(θ, p) - end - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - cons_h = function (θ) - return map(1:num_cons) do i - Zygote.hessian(fncs[i], θ) - end + function cons_oop(x) + _res = Zygote.Buffer(x, num_cons) + cons(_res, x) + return copy(_res) end - else - cons_h = (θ) -> f.cons_h(θ, p) - end - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) + function lagrangian(x, σ = one(eltype(x)), λ = ones(eltype(x), num_cons)) + return σ * _f(x) + dot(λ, cons_oop(x)) + end end - return OptimizationFunction{false}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = f.hess_prototype, - cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype, - lag_h = lag_h, - lag_hess_prototype = f.lag_hess_prototype, - sys = f.sys, - expr = f.expr, - cons_expr = f.cons_expr) -end - -function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, - cache::OptimizationBase.ReInitCache, - adtype::AutoZygote, num_cons = 0) - _f = (θ, args...) -> f(θ, cache.p, args...)[1] - p = cache.p - - if f.grad === nothing - grad = function (θ, args...) - val = Zygote.gradient(x -> _f(x, args...), θ)[1] - if val === nothing - return zero(eltype(θ)) - else - return val + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if cons !== nothing && f.cons_j === nothing + extras_jac = prepare_jacobian(cons_oop, adtype, x) + function cons_j(J, θ) + jacobian!(cons_oop, J, adtype, θ, extras_jac) + if size(J, 1) == 1 + J = vec(J) end end + cons_jac_prototype = extras_jac.sparsity + cons_jac_colorvec = extras_jac.colors else - grad = (θ, args...) -> f.grad(θ, p, args...) + cons_j = (J, θ) -> f.cons_j(J, θ, p) end - if f.hess === nothing - hess = function (θ, args...) - return ForwardDiff.jacobian(θ) do θ - Zygote.gradient(x -> _f(x, args...), θ)[1] - end + if f.cons_vjp === nothing && cons_vjp == true + extras_pullback = prepare_pullback(cons_oop, adtype, x) + function cons_vjp!(J, θ, v) + pullback!(cons_oop, J, adtype.dense_ad, θ, v, extras_pullback) end else - hess = (θ, args...) -> f.hess(θ, p, args...) + cons_vjp! = nothing end - if f.hv === nothing - hv = function (H, θ, v, args...) - _θ = ForwardDiff.Dual.(θ, v) - res = grad(_θ, args...) - return getindex.(ForwardDiff.partials.(res), 1) + if f.cons_jvp === nothing && cons_jvp == true + extras_pushforward = prepare_pushforward(cons_oop, adtype, x) + function cons_jvp!(J, θ, v) + pushforward!(cons_oop, J, adtype.dense_ad, θ, v, extras_pushforward) end else - hv = f.hv + cons_jvp! 
= nothing end - if f.cons === nothing - cons = nothing + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if cons !== nothing && f.cons_h === nothing + fncs = [@closure (x) -> cons_oop(x)[i] for i in 1:num_cons] + extras_cons_hess = Vector(undef, length(fncs)) + for ind in 1:num_cons + extras_cons_hess[ind] = prepare_hessian(fncs[ind], soadtype, x) + end + conshess_sparsity = getfield.(extras_cons_hess, :sparsity) + conshess_colors = getfield.(extras_cons_hess, :colors) + function cons_h(H, θ) + for i in 1:num_cons + hessian!(fncs[i], H[i], soadtype, θ, extras_cons_hess[i]) + end + end else - cons = (θ) -> f.cons(θ, p) - cons_oop = cons + cons_h = (res, θ) -> f.cons_h(res, θ, p) end - if cons !== nothing && f.cons_j === nothing - cons_j = function (θ) - if num_cons > 1 - return first(Zygote.jacobian(cons_oop, θ)) + lag_hess_prototype = f.lag_hess_prototype + if cons !== nothing && f.lag_h === nothing + lag_extras = prepare_hessian(lagrangian, soadtype, x) + lag_hess_prototype = lag_extras.sparsity + + function lag_h(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h(H, θ) + H *= λ else - return first(Zygote.jacobian(cons_oop, θ))[1, :] + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) end end - else - cons_j = (θ) -> f.cons_j(θ, p) - end - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - cons_h = function (θ) - return map(1:num_cons) do i - Zygote.hessian(fncs[i], θ) + function lag_h(h, θ, σ, λ) + H = eltype(θ).(lag_hess_prototype) + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end end end else - cons_h = (θ) -> f.cons_h(θ, p) - end - - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) + lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) end - - return OptimizationFunction{false}(f.f, adtype; grad = grad, hess = hess, hv = hv, + return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = f.hess_prototype, - cons_jac_prototype = f.cons_jac_prototype, - cons_hess_prototype = f.cons_hess_prototype, - lag_h = lag_h, - lag_hess_prototype = f.lag_hess_prototype, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h, + lag_hess_prototype = lag_hess_prototype, sys = f.sys, expr = f.expr, cons_expr = f.cons_expr) end +function instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoSparse{<:AutoZygote}, num_cons = 0) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons) +end + end diff --git a/src/OptimizationBase.jl b/src/OptimizationBase.jl index 52528bf..8a1c3e4 100644 --- a/src/OptimizationBase.jl +++ b/src/OptimizationBase.jl @@ -19,6 +19,8 @@ import SciMLBase: OptimizationProblem, MaxSense, MinSense, OptimizationStats export ObjSense, MaxSense, MinSense +using FastClosures + struct NullCallback end (x::NullCallback)(args...) 
= false const DEFAULT_CALLBACK = NullCallback() diff --git a/src/OptimizationDIExt.jl b/src/OptimizationDIExt.jl index e185d75..0ab34a2 100644 --- a/src/OptimizationDIExt.jl +++ b/src/OptimizationDIExt.jl @@ -9,11 +9,7 @@ import DifferentiationInterface: prepare_gradient, prepare_hessian, prepare_hvp, hvp, jacobian using ADTypes, SciMLBase -function instantiate_function( - f::OptimizationFunction{true}, x, adtype::ADTypes.AbstractADType, - p = SciMLBase.NullParameters(), num_cons = 0) - _f = (θ, args...) -> first(f.f(θ, p, args...)) - +function generate_adtype(adtype) if !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ForwardMode soadtype = DifferentiationInterface.SecondOrder(adtype, AutoReverseDiff()) #make zygote? elseif !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ReverseMode @@ -21,6 +17,18 @@ function instantiate_function( else soadtype = adtype end + return adtype, soadtype +end + +function instantiate_function( + f::OptimizationFunction{true}, x, adtype::ADTypes.AbstractADType, + p = SciMLBase.NullParameters(), num_cons = 0; + fg = false, fgh = false, cons_vjp = false, cons_jvp = false) + function _f(θ) + return f(θ, p)[1] + end + + adtype, soadtype = generate_adtype(adtype) if f.grad === nothing extras_grad = prepare_gradient(_f, adtype, x) @@ -28,23 +36,37 @@ function instantiate_function( gradient!(_f, res, adtype, θ, extras_grad) end else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + grad = (G, θ) -> f.grad(G, θ, p) + end + + if fg == true && f.fg === nothing + function fg!(res, θ) + (y, _) = value_and_gradient!(_f, res, adtype, θ, extras_grad) + return y + end end hess_sparsity = f.hess_prototype hess_colors = f.hess_colorvec if f.hess === nothing extras_hess = prepare_hessian(_f, soadtype, x) - function hess(res, θ, args...) + function hess(res, θ) hessian!(_f, res, soadtype, θ, extras_hess) end else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + hess = (H, θ) -> f.hess(H, θ, p) + end + + if fgh == true && f.fgh !== nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!(_f, G, H, θ, extras_hess) + return y + end end if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype, x, rand(size(x))) - hv = function (H, θ, v, args...) 
+ extras_hvp = prepare_hvp(_f, soadtype, x, zeros(eltype(x), size(x))) + hv = function (H, θ, v) hvp!(_f, H, soadtype, θ, v, extras_hvp) end else @@ -54,15 +76,26 @@ function instantiate_function( if f.cons === nothing cons = nothing else - cons = (res, θ) -> f.cons(res, θ, p) - cons_oop = (x) -> (_res = zeros(eltype(x), num_cons); cons(_res, x); _res) + function cons(res, θ) + return f.cons(res, θ, p) + end + + function cons_oop(x) + _res = zeros(eltype(x), num_cons) + cons(_res, x) + return _res + end + + function lagrangian(x, σ = one(eltype(x)), λ = ones(eltype(x), num_cons)) + return σ * _f(x) + dot(λ, cons_oop(x)) + end end cons_jac_prototype = f.cons_jac_prototype cons_jac_colorvec = f.cons_jac_colorvec if cons !== nothing && f.cons_j === nothing extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (J, θ) + function cons_j(J, θ) jacobian!(cons_oop, J, adtype, θ, extras_jac) if size(J, 1) == 1 J = vec(J) @@ -72,100 +105,22 @@ function instantiate_function( cons_j = (J, θ) -> f.cons_j(J, θ, p) end - conshess_sparsity = f.cons_hess_prototype - conshess_colors = f.cons_hess_colorvec - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) - - function cons_h(H, θ) - for i in 1:num_cons - hessian!(fncs[i], H[i], soadtype, θ, extras_cons_hess[i]) - end - end - else - cons_h = (res, θ) -> f.cons_h(res, θ, p) - end - - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) - end - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = hess_sparsity, - hess_colorvec = hess_colors, - cons_jac_prototype = cons_jac_prototype, - cons_jac_colorvec = cons_jac_colorvec, - cons_hess_prototype = conshess_sparsity, - cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) -end - -function instantiate_function( - f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, - adtype::ADTypes.AbstractADType, num_cons = 0) - x = cache.u0 - p = cache.p - _f = (θ, args...) -> first(f.f(θ, p, args...)) - - if !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ForwardMode - soadtype = DifferentiationInterface.SecondOrder(adtype, AutoReverseDiff()) #make zygote? - elseif !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ReverseMode - soadtype = DifferentiationInterface.SecondOrder(AutoForwardDiff(), adtype) - else - soadtype = adtype - end - - if f.grad === nothing - extras_grad = prepare_gradient(_f, adtype, x) - function grad(res, θ) - gradient!(_f, res, adtype, θ, extras_grad) - end - else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) - end - - hess_sparsity = f.hess_prototype - hess_colors = f.hess_colorvec - if f.hess === nothing - extras_hess = prepare_hessian(_f, soadtype, x) - function hess(res, θ, args...) - hessian!(_f, res, soadtype, θ, extras_hess) + if f.cons_vjp === nothing && cons_vjp == true + extras_pullback = prepare_pullback(cons_oop, adtype, x) + function cons_vjp!(J, θ, v) + pullback!(cons_oop, J, adtype, θ, v, extras_pullback) end else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + cons_vjp! = nothing end - if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype, x, rand(size(x))) - hv = function (H, θ, v, args...) 
- hvp!(_f, H, soadtype, θ, v, extras_hvp) + if f.cons_jvp === nothing && cons_jvp == true + extras_pushforward = prepare_pushforward(cons_oop, adtype, x) + function cons_jvp!(J, θ, v) + pushforward!(cons_oop, J, adtype, θ, v, extras_pushforward) end else - hv = f.hv - end - - if f.cons === nothing - cons = nothing - else - cons = (res, θ) -> f.cons(res, θ, p) - cons_oop = (x) -> (_res = zeros(eltype(x), num_cons); cons(_res, x); _res) - end - - cons_jac_prototype = f.cons_jac_prototype - cons_jac_colorvec = f.cons_jac_colorvec - if cons !== nothing && f.cons_j === nothing - extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (J, θ) - jacobian!(cons_oop, J, adtype, θ, extras_jac) - if size(J, 1) == 1 - J = vec(J) - end - end - else - cons_j = (J, θ) -> f.cons_j(J, θ, p) + cons_jvp! = nothing end conshess_sparsity = f.cons_hess_prototype @@ -183,156 +138,112 @@ function instantiate_function( cons_h = (res, θ) -> f.cons_h(res, θ, p) end - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) - end - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = hess_sparsity, - hess_colorvec = hess_colors, - cons_jac_prototype = cons_jac_prototype, - cons_jac_colorvec = cons_jac_colorvec, - cons_hess_prototype = conshess_sparsity, - cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) -end + lag_hess_prototype = f.lag_hess_prototype -function instantiate_function( - f::OptimizationFunction{false}, x, adtype::ADTypes.AbstractADType, - p = SciMLBase.NullParameters(), num_cons = 0) - _f = (θ, args...) -> first(f.f(θ, p, args...)) + if cons !== nothing && f.lag_h === nothing + lag_extras = prepare_hessian(lagrangian, soadtype, x) + lag_hess_prototype = zeros(Bool, length(x), length(x)) - if !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ForwardMode - soadtype = DifferentiationInterface.SecondOrder(adtype, AutoReverseDiff()) #make zygote? - elseif !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ReverseMode - soadtype = DifferentiationInterface.SecondOrder(AutoForwardDiff(), adtype) - else - soadtype = adtype - end - - if f.grad === nothing - extras_grad = prepare_gradient(_f, adtype, x) - function grad(θ) - gradient(_f, adtype, θ, extras_grad) - end - else - grad = (θ, args...) -> f.grad(θ, p, args...) - end - - hess_sparsity = f.hess_prototype - hess_colors = f.hess_colorvec - if f.hess === nothing - extras_hess = prepare_hessian(_f, soadtype, x) #placeholder logic, can be made much better - function hess(θ, args...) - hessian(_f, soadtype, θ, extras_hess) - end - else - hess = (θ, args...) -> f.hess(θ, p, args...) - end - - if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype, x, rand(size(x))) - hv = function (θ, v, args...) 
- hvp(_f, soadtype, θ, v, extras_hvp) - end - else - hv = f.hv - end - - if f.cons === nothing - cons = nothing - else - cons = (θ) -> f.cons(θ, p) - cons_oop = cons - end - - cons_jac_prototype = f.cons_jac_prototype - cons_jac_colorvec = f.cons_jac_colorvec - if cons !== nothing && f.cons_j === nothing - extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (θ) - J = jacobian(cons_oop, adtype, θ, extras_jac) - if size(J, 1) == 1 - J = vec(J) + function lag_h(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h(H, θ) + H *= λ + else + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) end - return J end - else - cons_j = (θ) -> f.cons_j(θ, p) - end - conshess_sparsity = f.cons_hess_prototype - conshess_colors = f.cons_hess_colorvec - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) - - function cons_h(θ) - H = map(1:num_cons) do i - hessian(fncs[i], soadtype, θ, extras_cons_hess[i]) + function lag_h(h, θ, σ, λ) + H = eltype(θ).(lag_hess_prototype) + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end end - return H end else - cons_h = (res, θ) -> f.cons_h(res, θ, p) + lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) end - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) - end return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, cons = cons, cons_j = cons_j, cons_h = cons_h, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, hess_prototype = hess_sparsity, hess_colorvec = hess_colors, cons_jac_prototype = cons_jac_prototype, cons_jac_colorvec = cons_jac_colorvec, cons_hess_prototype = conshess_sparsity, cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) + lag_h, + lag_hess_prototype = lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) end function instantiate_function( - f::OptimizationFunction{false}, cache::OptimizationBase.ReInitCache, + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, adtype::ADTypes.AbstractADType, num_cons = 0) x = cache.u0 p = cache.p - _f = (θ, args...) -> first(f.f(θ, p, args...)) - if !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ForwardMode - soadtype = DifferentiationInterface.SecondOrder(adtype, AutoReverseDiff()) #make zygote? - elseif !(adtype isa SciMLBase.NoAD) && ADTypes.mode(adtype) isa ADTypes.ReverseMode - soadtype = DifferentiationInterface.SecondOrder(AutoForwardDiff(), adtype) - else - soadtype = adtype + return instantiate_function(f, x, adtype, p, num_cons) +end + +function instantiate_function( + f::OptimizationFunction{false}, x, adtype::ADTypes.AbstractADType, + p = SciMLBase.NullParameters(), num_cons = 0; + fg = false, fgh = false, cons_vjp = false, cons_jvp = false) + function _f(θ) + return f(θ, p)[1] end + adtype, soadtype = generate_adtype(adtype) + if f.grad === nothing extras_grad = prepare_gradient(_f, adtype, x) function grad(θ) gradient(_f, adtype, θ, extras_grad) end else - grad = (θ, args...) -> f.grad(θ, p, args...) 
+ grad = (θ) -> f.grad(θ, p) + end + + if fg == true && f.fg === nothing + function fg!(θ) + res = zeros(eltype(x), size(x)) + (y, _) = value_and_gradient!(_f, res, adtype, θ, extras_grad) + return y, res + end end hess_sparsity = f.hess_prototype hess_colors = f.hess_colorvec if f.hess === nothing - extras_hess = prepare_hessian(_f, soadtype, x) #placeholder logic, can be made much better - function hess(θ, args...) + extras_hess = prepare_hessian(_f, soadtype, x) + function hess(θ) hessian(_f, soadtype, θ, extras_hess) end else - hess = (θ, args...) -> f.hess(θ, p, args...) + hess = (θ) -> f.hess(θ, p) + end + + if fgh == true && f.fgh !== nothing + function fgh!(θ) + G = zeros(eltype(x), size(x)) + H = zeros(eltype(x), size(x, 1), size(x, 1)) + (y, _, _) = value_derivative_and_second_derivative!(_f, G, H, θ, extras_hess) + return y, G, H + end end if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype, x, rand(size(x))) - hv = function (θ, v, args...) + extras_hvp = prepare_hvp(_f, soadtype, x, zeros(eltype(x), size(x))) + function hv(θ, v) hvp(_f, soadtype, θ, v, extras_hvp) end else @@ -342,16 +253,21 @@ function instantiate_function( if f.cons === nothing cons = nothing else - cons = (θ) -> f.cons(θ, p) - cons_oop = cons + function cons(θ) + return f.cons(θ, p) + end + + function lagrangian(x, σ = one(eltype(x)), λ = ones(eltype(x), num_cons)) + return σ * _f(x) + dot(λ, cons(x)) + end end cons_jac_prototype = f.cons_jac_prototype cons_jac_colorvec = f.cons_jac_colorvec if cons !== nothing && f.cons_j === nothing - extras_jac = prepare_jacobian(cons_oop, adtype, x) + extras_jac = prepare_jacobian(cons, adtype, x) cons_j = function (θ) - J = jacobian(cons_oop, adtype, θ, extras_jac) + J = jacobian(cons, adtype, θ, extras_jac) if size(J, 1) == 1 J = vec(J) end @@ -361,10 +277,28 @@ function instantiate_function( cons_j = (θ) -> f.cons_j(θ, p) end + if f.cons_vjp === nothing && cons_vjp == true + extras_pullback = prepare_pullback(cons, adtype, x) + function cons_vjp!(θ, v) + return pullback(cons, adtype, θ, v, extras_pullback) + end + else + cons_vjp! = nothing + end + + if f.cons_jvp === nothing && cons_jvp == true + extras_pushforward = prepare_pushforward(cons, adtype, x) + function cons_jvp!(θ, v) + return pushforward(cons, adtype, θ, v, extras_pushforward) + end + else + cons_jvp! 
= nothing + end + conshess_sparsity = f.cons_hess_prototype conshess_colors = f.cons_hess_colorvec if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] + fncs = [(x) -> cons(x)[i] for i in 1:num_cons] extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) function cons_h(θ) @@ -377,18 +311,48 @@ function instantiate_function( cons_h = (θ) -> f.cons_h(θ, p) end - if f.lag_h === nothing - lag_h = nothing # Consider implementing this + lag_hess_prototype = f.lag_hess_prototype + + if cons !== nothing && f.lag_h === nothing + lag_extras = prepare_hessian(lagrangian, soadtype, x) + lag_hess_prototype = zeros(Bool, length(x), length(x)) + + function lag_h(θ, σ, λ) + if σ == zero(eltype(θ)) + H = cons_h(θ) + for i in 1:num_cons + H[i] *= λ[i] + end + return H + else + return hessian(x -> lagrangian(x, σ, λ), soadtype, θ, lag_extras) + end + end else lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) end - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, + + return OptimizationFunction{false}(f.f, adtype; grad = grad, hess = hess, hv = hv, cons = cons, cons_j = cons_j, cons_h = cons_h, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, hess_prototype = hess_sparsity, hess_colorvec = hess_colors, cons_jac_prototype = cons_jac_prototype, cons_jac_colorvec = cons_jac_colorvec, cons_hess_prototype = conshess_sparsity, cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) + lag_h = lag_h, + lag_hess_prototype = lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function instantiate_function( + f::OptimizationFunction{false}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AbstractADType, num_cons = 0) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons) end diff --git a/src/OptimizationDISparseExt.jl b/src/OptimizationDISparseExt.jl index 375a903..2599914 100644 --- a/src/OptimizationDISparseExt.jl +++ b/src/OptimizationDISparseExt.jl @@ -21,7 +21,7 @@ function generate_sparse_adtype(adtype) sparsity_detector = TracerSparsityDetector(), coloring_algorithm = GreedyColoringAlgorithm()) elseif !(adtype.dense_ad isa SciMLBase.NoAD) && - ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode + ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode soadtype = AutoSparse( DifferentiationInterface.SecondOrder(adtype.dense_ad, AutoReverseDiff()), sparsity_detector = TracerSparsityDetector(), @@ -43,7 +43,7 @@ function generate_sparse_adtype(adtype) sparsity_detector = TracerSparsityDetector(), coloring_algorithm = adtype.coloring_algorithm) elseif !(adtype.dense_ad isa SciMLBase.NoAD) && - ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode + ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode soadtype = AutoSparse( DifferentiationInterface.SecondOrder(adtype.dense_ad, AutoReverseDiff()), sparsity_detector = TracerSparsityDetector(), @@ -65,7 +65,7 @@ function generate_sparse_adtype(adtype) sparsity_detector = adtype.sparsity_detector, coloring_algorithm = GreedyColoringAlgorithm()) elseif !(adtype.dense_ad isa SciMLBase.NoAD) && - ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode + ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode soadtype = AutoSparse( DifferentiationInterface.SecondOrder(adtype.dense_ad, AutoReverseDiff()), sparsity_detector = adtype.sparsity_detector, @@ -84,7 +84,7 @@ function generate_sparse_adtype(adtype) sparsity_detector = adtype.sparsity_detector, coloring_algorithm = adtype.coloring_algorithm) elseif 
!(adtype.dense_ad isa SciMLBase.NoAD) && - ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode + ADTypes.mode(adtype.dense_ad) isa ADTypes.ForwardMode soadtype = AutoSparse( DifferentiationInterface.SecondOrder(adtype.dense_ad, AutoReverseDiff()), sparsity_detector = adtype.sparsity_detector, @@ -102,8 +102,12 @@ end function instantiate_function( f::OptimizationFunction{true}, x, adtype::ADTypes.AutoSparse{<:AbstractADType}, - p = SciMLBase.NullParameters(), num_cons = 0) - _f = (θ, args...) -> first(f.f(θ, p, args...)) + p = SciMLBase.NullParameters(), num_cons = 0; + fg = false, fgh = false, + cons_vjp = false, cons_jvp = false) + function _f(θ) + return f(θ, p)[1] + end adtype, soadtype = generate_sparse_adtype(adtype) @@ -113,23 +117,39 @@ function instantiate_function( gradient!(_f, res, adtype.dense_ad, θ, extras_grad) end else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + grad = (G, θ) -> f.grad(G, θ, p) + end + + if fg == true && f.fg !== nothing + function fg!(res, θ) + (y, _) = value_and_gradient!(_f, res, adtype.dense_ad, θ, extras_grad) + return y + end end hess_sparsity = f.hess_prototype hess_colors = f.hess_colorvec if f.hess === nothing extras_hess = prepare_hessian(_f, soadtype, x) #placeholder logic, can be made much better - function hess(res, θ, args...) + function hess(res, θ) hessian!(_f, res, soadtype, θ, extras_hess) end + hess_sparsity = extras_hess.sparsity + hess_colors = extras_hess.colors else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + hess = (H, θ) -> f.hess(H, θ, p) + end + + if fgh == true && f.fgh !== nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!(_f, G, H, θ, extras_hess) + return y + end end if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, rand(size(x))) - hv = function (H, θ, v, args...) 
+ extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, zeros(eltype(x), size(x))) + hv = function (H, θ, v) hvp!(_f, H, soadtype.dense_ad, θ, v, extras_hvp) end else @@ -139,120 +159,65 @@ function instantiate_function( if f.cons === nothing cons = nothing else - cons = (res, θ) -> f.cons(res, θ, p) - cons_oop = (x) -> (_res = zeros(eltype(x), num_cons); cons(_res, x); _res) + function cons(res, θ) + f.cons(res, θ, p) + end + + function cons_oop(x) + _res = zeros(eltype(x), num_cons) + cons(_res, x) + return _res + end + + function lagrangian(x, σ = one(eltype(x)), λ = ones(eltype(x), num_cons)) + return σ * _f(x) + dot(λ, cons_oop(x)) + end end cons_jac_prototype = f.cons_jac_prototype cons_jac_colorvec = f.cons_jac_colorvec if cons !== nothing && f.cons_j === nothing extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (J, θ) + function cons_j(J, θ) jacobian!(cons_oop, J, adtype, θ, extras_jac) if size(J, 1) == 1 J = vec(J) end end + cons_jac_prototype = extras_jac.sparsity + cons_jac_colorvec = extras_jac.colors else cons_j = (J, θ) -> f.cons_j(J, θ, p) end - conshess_sparsity = f.cons_hess_prototype - conshess_colors = f.cons_hess_colorvec - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) - - function cons_h(H, θ) - for i in 1:num_cons - hessian!(fncs[i], H[i], soadtype, θ, extras_cons_hess[i]) - end - end - else - cons_h = (res, θ) -> f.cons_h(res, θ, p) - end - - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) - end - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = hess_sparsity, - hess_colorvec = hess_colors, - cons_jac_prototype = cons_jac_prototype, - cons_jac_colorvec = cons_jac_colorvec, - cons_hess_prototype = conshess_sparsity, - cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) -end - -function instantiate_function( - f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, - adtype::ADTypes.AutoSparse{<:AbstractADType}, num_cons = 0) - x = cache.u0 - p = cache.p - _f = (θ, args...) -> first(f.f(θ, p, args...)) - - adtype, soadtype = generate_sparse_adtype(adtype) - - if f.grad === nothing - extras_grad = prepare_gradient(_f, adtype.dense_ad, x) - function grad(res, θ) - gradient!(_f, res, adtype.dense_ad, θ, extras_grad) - end - else - grad = (G, θ, args...) -> f.grad(G, θ, p, args...) - end - - hess_sparsity = f.hess_prototype - hess_colors = f.hess_colorvec - if f.hess === nothing - extras_hess = prepare_hessian(_f, soadtype, x) - function hess(res, θ, args...) - hessian!(_f, res, soadtype, θ, extras_hess) - end - else - hess = (H, θ, args...) -> f.hess(H, θ, p, args...) - end - - if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, rand(size(x))) - hv = function (H, θ, v, args...) - hvp!(_f, H, soadtype.dense_ad, θ, v, extras_hvp) + if f.cons_vjp === nothing && cons_vjp == true + extras_pullback = prepare_pullback(cons_oop, adtype, x) + function cons_vjp!(J, θ, v) + pullback!(cons_oop, J, adtype.dense_ad, θ, v, extras_pullback) end else - hv = f.hv + cons_vjp! 
= nothing end - if f.cons === nothing - cons = nothing - else - cons = (res, θ) -> f.cons(res, θ, p) - cons_oop = (x) -> (_res = zeros(eltype(x), num_cons); cons(_res, x); _res) - end - - cons_jac_prototype = f.cons_jac_prototype - cons_jac_colorvec = f.cons_jac_colorvec - if cons !== nothing && f.cons_j === nothing - extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (J, θ) - jacobian!(cons_oop, J, adtype, θ, extras_jac) - if size(J, 1) == 1 - J = vec(J) - end + if f.cons_jvp === nothing && cons_jvp == true + extras_pushforward = prepare_pushforward(cons_oop, adtype, x) + function cons_jvp!(J, θ, v) + pushforward!(cons_oop, J, adtype.dense_ad, θ, v, extras_pushforward) end else - cons_j = (J, θ) -> f.cons_j(J, θ, p) + cons_jvp! = nothing end conshess_sparsity = f.cons_hess_prototype conshess_colors = f.cons_hess_colorvec if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) - + fncs = [@closure (x) -> cons_oop(x)[i] for i in 1:num_cons] + extras_cons_hess = Vector(undef, length(fncs)) + for ind in 1:num_cons + extras_cons_hess[ind] = prepare_hessian(fncs[ind], soadtype, x) + end + conshess_sparsity = getfield.(extras_cons_hess, :sparsity) + conshess_colors = getfield.(extras_cons_hess, :colors) function cons_h(H, θ) for i in 1:num_cons hessian!(fncs[i], H[i], soadtype, θ, extras_cons_hess[i]) @@ -262,8 +227,32 @@ function instantiate_function( cons_h = (res, θ) -> f.cons_h(res, θ, p) end - if f.lag_h === nothing - lag_h = nothing # Consider implementing this + lag_hess_prototype = f.lag_hess_prototype + if cons !== nothing && f.lag_h === nothing + lag_extras = prepare_hessian(lagrangian, soadtype, x) + lag_hess_prototype = lag_extras.sparsity + + function lag_h(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h(H, θ) + H *= λ + else + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) + end + end + + function lag_h(h, θ, σ, λ) + H = eltype(θ).(lag_hess_prototype) + hessian!(x -> lagrangian(x, σ, λ), H, soadtype, θ, lag_extras) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end + end + end else lag_h = (res, θ, σ, μ) -> f.lag_h(res, θ, σ, μ, p) end @@ -275,13 +264,28 @@ function instantiate_function( cons_jac_colorvec = cons_jac_colorvec, cons_hess_prototype = conshess_sparsity, cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) + lag_h, + lag_hess_prototype = lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoSparse{<:AbstractADType}, num_cons = 0) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons) end function instantiate_function( f::OptimizationFunction{false}, x, adtype::ADTypes.AutoSparse{<:AbstractADType}, p = SciMLBase.NullParameters(), num_cons = 0) - _f = (θ, args...) -> first(f.f(θ, p, args...)) + function _f(θ) + return f(θ, p)[1] + end adtype, soadtype = generate_sparse_adtype(adtype) @@ -291,23 +295,23 @@ function instantiate_function( gradient(_f, adtype.dense_ad, θ, extras_grad) end else - grad = (θ, args...) -> f.grad(θ, p, args...) 
+ grad = (θ) -> f.grad(θ, p) end hess_sparsity = f.hess_prototype hess_colors = f.hess_colorvec if f.hess === nothing extras_hess = prepare_hessian(_f, soadtype, x) #placeholder logic, can be made much better - function hess(θ, args...) + function hess(θ) hessian(_f, soadtype, θ, extras_hess) end else - hess = (θ, args...) -> f.hess(θ, p, args...) + hess = (θ) -> f.hess(θ, p) end if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, rand(size(x))) - hv = function (θ, v, args...) + extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, zeros(eltype(x), size(x))) + function hv(θ, v) hvp(_f, soadtype.dense_ad, θ, v, extras_hvp) end else @@ -317,16 +321,17 @@ function instantiate_function( if f.cons === nothing cons = nothing else - cons = (θ) -> f.cons(θ, p) - cons_oop = cons + function cons(θ) + f.cons(θ, p) + end end cons_jac_prototype = f.cons_jac_prototype cons_jac_colorvec = f.cons_jac_colorvec if cons !== nothing && f.cons_j === nothing - extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (θ) - J = jacobian(cons_oop, adtype, θ, extras_jac) + extras_jac = prepare_jacobian(cons, adtype, x) + function cons_j(θ) + J = jacobian(cons, adtype, θ, extras_jac) if size(J, 1) == 1 J = vec(J) end @@ -339,7 +344,7 @@ function instantiate_function( conshess_sparsity = f.cons_hess_prototype conshess_colors = f.cons_hess_colorvec if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] + fncs = [(x) -> cons(x)[i] for i in 1:num_cons] extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) function cons_h(θ) @@ -365,7 +370,11 @@ function instantiate_function( cons_jac_colorvec = cons_jac_colorvec, cons_hess_prototype = conshess_sparsity, cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) + lag_h, + lag_hess_prototype = f.lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) end function instantiate_function( @@ -373,89 +382,6 @@ function instantiate_function( adtype::ADTypes.AutoSparse{<:AbstractADType}, num_cons = 0) x = cache.u0 p = cache.p - _f = (θ, args...) -> first(f.f(θ, p, args...)) - - adtype, soadtype = generate_sparse_adtype(adtype) - - if f.grad === nothing - extras_grad = prepare_gradient(_f, adtype.dense_ad, x) - function grad(θ) - gradient(_f, adtype.dense_ad, θ, extras_grad) - end - else - grad = (θ, args...) -> f.grad(θ, p, args...) - end - - hess_sparsity = f.hess_prototype - hess_colors = f.hess_colorvec - if f.hess === nothing - extras_hess = prepare_hessian(_f, soadtype, x) #placeholder logic, can be made much better - function hess(θ, args...) - hessian(_f, soadtype, θ, extras_hess) - end - else - hess = (θ, args...) -> f.hess(θ, p, args...) - end - if f.hv === nothing - extras_hvp = prepare_hvp(_f, soadtype.dense_ad, x, rand(size(x))) - hv = function (θ, v, args...) 
- hvp(_f, soadtype.dense_ad, θ, v, extras_hvp) - end - else - hv = f.hv - end - - if f.cons === nothing - cons = nothing - else - cons = (θ) -> f.cons(θ, p) - cons_oop = cons - end - - cons_jac_prototype = f.cons_jac_prototype - cons_jac_colorvec = f.cons_jac_colorvec - if cons !== nothing && f.cons_j === nothing - extras_jac = prepare_jacobian(cons_oop, adtype, x) - cons_j = function (θ) - J = jacobian(cons_oop, adtype, θ, extras_jac) - if size(J, 1) == 1 - J = vec(J) - end - return J - end - else - cons_j = (θ) -> f.cons_j(θ, p) - end - - conshess_sparsity = f.cons_hess_prototype - conshess_colors = f.cons_hess_colorvec - if cons !== nothing && f.cons_h === nothing - fncs = [(x) -> cons_oop(x)[i] for i in 1:num_cons] - extras_cons_hess = prepare_hessian.(fncs, Ref(soadtype), Ref(x)) - - function cons_h(θ) - H = map(1:num_cons) do i - hessian(fncs[i], soadtype, θ, extras_cons_hess[i]) - end - return H - end - else - cons_h = (θ) -> f.cons_h(θ, p) - end - - if f.lag_h === nothing - lag_h = nothing # Consider implementing this - else - lag_h = (θ, σ, μ) -> f.lag_h(θ, σ, μ, p) - end - return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, - cons = cons, cons_j = cons_j, cons_h = cons_h, - hess_prototype = hess_sparsity, - hess_colorvec = hess_colors, - cons_jac_prototype = cons_jac_prototype, - cons_jac_colorvec = cons_jac_colorvec, - cons_hess_prototype = conshess_sparsity, - cons_hess_colorvec = conshess_colors, - lag_h, f.lag_hess_prototype) + return instantiate_function(f, x, adtype, p, num_cons) end diff --git a/src/augmented_lagrangian.jl b/src/augmented_lagrangian.jl new file mode 100644 index 0000000..8790900 --- /dev/null +++ b/src/augmented_lagrangian.jl @@ -0,0 +1,13 @@ +function generate_auglag(θ) + x = cache.f(θ, cache.p) + cons_tmp .= zero(eltype(θ)) + cache.f.cons(cons_tmp, θ) + cons_tmp[eq_inds] .= cons_tmp[eq_inds] - cache.lcons[eq_inds] + cons_tmp[ineq_inds] .= cons_tmp[ineq_inds] .- cache.ucons[ineq_inds] + opt_state = Optimization.OptimizationState(u = θ, objective = x[1]) + if cache.callback(opt_state, x...) + error("Optimization halted by callback.") + end + return x[1] + sum(@. λ * cons_tmp[eq_inds] + ρ / 2 * (cons_tmp[eq_inds] .^ 2)) + + 1 / (2 * ρ) * sum((max.(Ref(0.0), μ .+ (ρ .* cons_tmp[ineq_inds]))) .^ 2) +end diff --git a/src/function.jl b/src/function.jl index 257d680..63e194f 100644 --- a/src/function.jl +++ b/src/function.jl @@ -72,7 +72,8 @@ function instantiate_function(f::OptimizationFunction{true}, x, ::SciMLBase.NoAD observed = f.observed) end -function instantiate_function(f::OptimizationFunction{true}, cache::ReInitCache, ::SciMLBase.NoAD, +function instantiate_function( + f::OptimizationFunction{true}, cache::ReInitCache, ::SciMLBase.NoAD, num_cons = 0) grad = f.grad === nothing ? nothing : (G, x, args...) -> f.grad(G, x, cache.p, args...) hess = f.hess === nothing ? nothing : (H, x, args...) -> f.hess(H, x, cache.p, args...) 
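Note on the new src/augmented_lagrangian.jl file introduced above: generate_auglag evaluates an augmented-Lagrangian merit function, i.e. the objective plus a multiplier-and-quadratic penalty on equality-constraint violations and a squared-hinge penalty on inequality-constraint violations, and errors out when the user callback requests a halt. As written it reads cache, cons_tmp, λ, ρ, μ, eq_inds and ineq_inds from an enclosing scope, so it is only meaningful inside a solver closure. A self-contained sketch of the same objective, with the captured variables passed explicitly, is given below; every name in it is illustrative rather than part of the package API, and the callback check is omitted.

    # Assumes the in-place constraint signature cons!(res, θ, p) used elsewhere in this PR;
    # lcons/ucons hold the equality and inequality right-hand sides, λ/μ the multipliers,
    # and ρ the penalty parameter.
    function auglag_objective(θ, f, cons!, p, lcons, ucons, eq_inds, ineq_inds, λ, μ, ρ)
        obj = f(θ, p)
        res = zeros(eltype(θ), length(lcons))
        cons!(res, θ, p)
        ceq = res[eq_inds] .- lcons[eq_inds]          # equality violations
        cineq = res[ineq_inds] .- ucons[ineq_inds]    # inequality violations (<= 0 is feasible)
        return obj + sum(@. λ * ceq + ρ / 2 * ceq^2) +
               1 / (2 * ρ) * sum(max.(0.0, μ .+ ρ .* cineq) .^ 2)
    end

This mirrors the formula in the new file: λ and ρ act as multiplier and penalty for the equalities, μ and ρ for the inequalities.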
diff --git a/test/adtests.jl b/test/adtests.jl index d0fb1ec..f93814a 100644 --- a/test/adtests.jl +++ b/test/adtests.jl @@ -26,7 +26,7 @@ H2 = Array{Float64}(undef, 2, 2) g!(G1, x0) h!(H1, x0) -cons = (res, x, p) -> (res .= [x[1]^2 + x[2]^2]) +cons = (res, x, p) -> (res .= [x[1]^2 + x[2]^2]; return nothing) optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoModelingToolkit(), cons = cons) optprob = OptimizationBase.instantiate_function(optf, x0, OptimizationBase.AutoModelingToolkit(), @@ -47,6 +47,7 @@ optprob.cons_h(H3, x0) function con2_c(res, x, p) res .= [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + return nothing end optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoModelingToolkit(), @@ -71,46 +72,42 @@ optprob.cons_h(H3, x0) G2 = Array{Float64}(undef, 2) H2 = Array{Float64}(undef, 2, 2) -if VERSION >= v"1.9" - optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoEnzyme(), cons = cons) - optprob = OptimizationBase.instantiate_function( - optf, x0, OptimizationBase.AutoEnzyme(), - nothing, 1) - optprob.grad(G2, x0) - @test G1 == G2 - optprob.hess(H2, x0) - @test H1 == H2 - res = Array{Float64}(undef, 1) - optprob.cons(res, x0) - @test res == [0.0] - J = Array{Float64}(undef, 2) - optprob.cons_j(J, [5.0, 3.0]) - @test J == [10.0, 6.0] - H3 = [Array{Float64}(undef, 2, 2)] - optprob.cons_h(H3, x0) - @test H3 == [[2.0 0.0; 0.0 2.0]] - - G2 = Array{Float64}(undef, 2) - H2 = Array{Float64}(undef, 2, 2) - - optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoEnzyme(), cons = con2_c) - optprob = OptimizationBase.instantiate_function( - optf, x0, OptimizationBase.AutoEnzyme(), - nothing, 2) - optprob.grad(G2, x0) - @test G1 == G2 - optprob.hess(H2, x0) - @test H1 == H2 - res = Array{Float64}(undef, 2) - optprob.cons(res, x0) - @test res == [0.0, 0.0] - J = Array{Float64}(undef, 2, 2) - optprob.cons_j(J, [5.0, 3.0]) - @test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) - H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] - optprob.cons_h(H3, x0) - @test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] -end +optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoEnzyme(), cons = cons) +optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoEnzyme(), + nothing, 1) +optprob.grad(G2, x0) +@test G1 == G2 +optprob.hess(H2, x0) +@test H1 == H2 +res = Array{Float64}(undef, 1) +optprob.cons(res, x0) +@test res == [0.0] +J = Array{Float64}(undef, 2) +optprob.cons_j(J, [5.0, 3.0]) +@test J == [10.0, 6.0] +H3 = [Array{Float64}(undef, 2, 2)] +optprob.cons_h(H3, x0) +@test H3 == [[2.0 0.0; 0.0 2.0]] +G2 = Array{Float64}(undef, 2) +H2 = Array{Float64}(undef, 2, 2) +optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoEnzyme(), cons = con2_c) +optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoEnzyme(), + nothing, 2) +optprob.grad(G2, x0) +@test G1 == G2 +optprob.hess(H2, x0) +@test H1 == H2 +res = Array{Float64}(undef, 2) +optprob.cons(res, x0) +@test res == [0.0, 0.0] +J = Array{Float64}(undef, 2, 2) +optprob.cons_j(J, [5.0, 3.0]) +@test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) +H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] +optprob.cons_h(H3, x0) +@test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] G2 = Array{Float64}(undef, 2) H2 = Array{Float64}(undef, 2, 2)
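Taken together, the refactor above removes the args... passthrough from the AD-generated oracles, adds fg/fgh combined-evaluation switches and optional cons_vjp/cons_jvp keywords, builds a Lagrangian-Hessian callback lag_h from the new lagrangian helper, and forwards sys, expr and cons_expr to the returned OptimizationFunction. The sketch below shows how the in-place interface produced by the dense DifferentiationInterface path might be exercised. The positional call mirrors the tests above; loading ForwardDiff and ReverseDiff is an assumption about which backend packages the second-order path needs, and the exact behaviour may differ for the Enzyme and sparse paths.

    using OptimizationBase, ADTypes
    using ForwardDiff, ReverseDiff   # dense backends assumed loaded for DifferentiationInterface

    rosenbrock(x, p) = (p[1] - x[1])^2 + p[2] * (x[2] - x[1]^2)^2
    con2!(res, x, p) = (res .= [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]]; return nothing)

    x0 = zeros(2)
    p = [1.0, 100.0]
    optf = OptimizationFunction(rosenbrock, ADTypes.AutoForwardDiff(), cons = con2!)
    # positional convention as in the tests: (f, x, adtype, p, num_cons)
    optprob = OptimizationBase.instantiate_function(optf, x0, ADTypes.AutoForwardDiff(), p, 2)

    G = zeros(2);     optprob.grad(G, x0)                   # in-place gradient
    H = zeros(2, 2);  optprob.hess(H, x0)                   # in-place Hessian
    J = zeros(2, 2);  optprob.cons_j(J, x0)                 # constraint Jacobian
    Hl = zeros(2, 2); optprob.lag_h(Hl, x0, 1.0, ones(2))   # Hessian of sigma*f + dot(lambda, cons)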