Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/core/v1alpha1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ const (
// Once either of them qualifies, we'll expose this as a field in Model.
ModelPreheatAnnoKey = "llmaz.io/model-preheat"

// ModelActivatorAnnoKey is used to indicate the model name activated by the activator.
ModelActivatorAnnoKey = "activator.llmaz.io/model-name"
// CachedModelActivatorAnnoKey is used to cache the activator state of the model.
CachedModelActivatorAnnoKey = "activator.llmaz.io/cached-state"

HUGGING_FACE = "Huggingface"
MODEL_SCOPE = "ModelScope"

Expand Down
4 changes: 4 additions & 0 deletions chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ spec:
env:
- name: KUBERNETES_CLUSTER_DOMAIN
value: {{ quote .Values.kubernetesClusterDomain }}
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
| default .Chart.AppVersion }}
livenessProbe:
Expand Down
2 changes: 2 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ controllerManager:
- --metrics-bind-address=:8443
- --leader-elect
- --namespace=llmaz-system
- --enable-service-activator
- --pod-ip=$(POD_IP)
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
23 changes: 21 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/dynamic"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
Expand Down Expand Up @@ -63,10 +64,14 @@ func main() {
var enableLeaderElection bool
var probeAddr string
var namespace string
var enableServiceActivator bool
var podIP string

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.StringVar(&namespace, "namespace", "llmaz-system", "The namespace of the llmaz to deploy")
flag.BoolVar(&enableServiceActivator, "enable-service-activator", false, "Enable the service activator feature. This is an experimental feature.")
flag.StringVar(&podIP, "pod-ip", "", "The pod IP of the llmaz controller manager. Only used when service activator is enabled.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
Expand Down Expand Up @@ -120,7 +125,7 @@ func main() {
// The cert won't be ready until the manager starts, so start a goroutine here
// that blocks until the cert is ready before setting up the controllers.
// Controllers that register after the manager starts will start directly.
go setupControllers(mgr, certsReady)
go setupControllers(mgr, certsReady, enableServiceActivator, podIP)

//+kubebuilder:scaffold:builder

Expand All @@ -140,7 +145,7 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServiceActivator bool, podIP string) {
// The controllers won't work until the webhooks are operating,
// and the webhooks won't work until the certs are all in place.
setupLog.Info("waiting for the cert generation to complete")
Expand Down Expand Up @@ -176,6 +181,20 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
os.Exit(1)
}

if enableServiceActivator {
dynamicClient, err := dynamic.NewForConfig(mgr.GetConfig())
if err != nil {
setupLog.Error(err, "unable to create dynamic client")
os.Exit(1)
}

activatorReconciler := inferencecontroller.NewActivatorReconciler(mgr, dynamicClient, podIP)
if err := activatorReconciler.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Activator")
os.Exit(1)
}
}

if os.Getenv("ENABLE_WEBHOOKS") != "false" {
if err := webhook.SetupOpenModelWebhook(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "Model")
Expand Down
50 changes: 48 additions & 2 deletions config/crd/bases/inference.llmaz.io_backendruntimes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,12 @@ spec:
- port
type: object
type: object
stopSignal:
description: |-
StopSignal defines which signal will be sent to a container when it is being stopped.
If not specified, the default is defined by the container runtime in use.
StopSignal can only be set for Pods with a non-empty .spec.os.name
type: string
type: object
livenessProbe:
description: |-
Expand Down Expand Up @@ -770,7 +776,9 @@ spec:
policies:
description: |-
policies is a list of potential scaling polices which can be used during scaling.
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
If not set, use the default values:
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
- For scale down: allow all pods to be removed in a 15s window.
items:
description: HPAScalingPolicy is a single
policy which must hold true for a specified
Expand Down Expand Up @@ -814,6 +822,24 @@ spec:
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
description: |-
tolerance is the tolerance on the ratio between the current and desired
metric value under which no updates are made to the desired number of
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
set, the default cluster-wide tolerance is applied (by default 10%).

For example, if autoscaling is configured with a memory consumption target of 100Mi,
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.

This is an alpha field and requires enabling the HPAConfigurableTolerance
feature gate.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
scaleUp:
description: |-
Expand All @@ -826,7 +852,9 @@ spec:
policies:
description: |-
policies is a list of potential scaling polices which can be used during scaling.
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
If not set, use the default values:
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
- For scale down: allow all pods to be removed in a 15s window.
items:
description: HPAScalingPolicy is a single
policy which must hold true for a specified
Expand Down Expand Up @@ -870,6 +898,24 @@ spec:
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
description: |-
tolerance is the tolerance on the ratio between the current and desired
metric value under which no updates are made to the desired number of
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
set, the default cluster-wide tolerance is applied (by default 10%).

For example, if autoscaling is configured with a memory consumption target of 100Mi,
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.

This is an alpha field and requires enabling the HPAConfigurableTolerance
feature gate.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
type: object
metrics:
Expand Down
44 changes: 42 additions & 2 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,9 @@ spec:
policies:
description: |-
policies is a list of potential scaling polices which can be used during scaling.
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
If not set, use the default values:
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
- For scale down: allow all pods to be removed in a 15s window.
items:
description: HPAScalingPolicy is a single policy
which must hold true for a specified past
Expand Down Expand Up @@ -339,6 +341,24 @@ spec:
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
description: |-
tolerance is the tolerance on the ratio between the current and desired
metric value under which no updates are made to the desired number of
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
set, the default cluster-wide tolerance is applied (by default 10%).

For example, if autoscaling is configured with a memory consumption target of 100Mi,
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.

This is an alpha field and requires enabling the HPAConfigurableTolerance
feature gate.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
scaleUp:
description: |-
Expand All @@ -351,7 +371,9 @@ spec:
policies:
description: |-
policies is a list of potential scaling polices which can be used during scaling.
At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
If not set, use the default values:
- For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
- For scale down: allow all pods to be removed in a 15s window.
items:
description: HPAScalingPolicy is a single policy
which must hold true for a specified past
Expand Down Expand Up @@ -395,6 +417,24 @@ spec:
- For scale down: 300 (i.e. the stabilization window is 300 seconds long).
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
description: |-
tolerance is the tolerance on the ratio between the current and desired
metric value under which no updates are made to the desired number of
replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
set, the default cluster-wide tolerance is applied (by default 10%).

For example, if autoscaling is configured with a memory consumption target of 100Mi,
and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
triggered when the actual consumption falls below 95Mi or exceeds 101Mi.

This is an alpha field and requires enabling the HPAConfigurableTolerance
feature gate.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
type: object
metrics:
Expand Down
Loading
Loading