InftyAI · kerthcet · Oct 30, 2025 · Jul 16, 2025 · Jul 21, 2025 · Jul 30, 2025
diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go
@@ -35,6 +35,11 @@ const (
 	// Once either of them qualified, we'll expose this as a field in Model.
 	ModelPreheatAnnoKey = "llmaz.io/model-preheat"
 
+	// ModelActivatorAnnoKey is used to indicate the model name activated by the activator.
+	ModelActivatorAnnoKey = "activator.llmaz.io/model-name"
+	// CachedModelActivatorAnnoKey is used to cache the activator state of the model.
+	CachedModelActivatorAnnoKey = "activator.llmaz.io/cached-state"
+
 	HUGGING_FACE = "Huggingface"
 	MODEL_SCOPE  = "ModelScope"
 

diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml
@@ -29,6 +29,10 @@ spec:
         env:
         - name: KUBERNETES_CLUSTER_DOMAIN
           value: {{ quote .Values.kubernetesClusterDomain }}
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
         image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
           | default .Chart.AppVersion }}
         livenessProbe:

diff --git a/chart/values.yaml b/chart/values.yaml
@@ -5,6 +5,8 @@ controllerManager:
     - --metrics-bind-address=:8443
     - --leader-elect
     - --namespace=llmaz-system
+    - --enable-service-activator
+    - --pod-ip=$(POD_IP)
     containerSecurityContext:
       allowPrivilegeEscalation: false
       capabilities:

diff --git a/cmd/main.go b/cmd/main.go
@@ -26,6 +26,7 @@ import (
 
 	"k8s.io/apimachinery/pkg/runtime"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/client-go/dynamic"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/healthz"
@@ -63,10 +64,14 @@ func main() {
 	var enableLeaderElection bool
 	var probeAddr string
 	var namespace string
+	var enableServiceActivator bool
+	var podIP string
 
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.StringVar(&namespace, "namespace", "llmaz-system", "The namespace of the llmaz to deploy")
+	flag.BoolVar(&enableServiceActivator, "enable-service-activator", false, "Enable the service activator feature. This is an experimental feature.")
+	flag.StringVar(&podIP, "pod-ip", "", "The pod IP of the llmaz controller manager. Only used when service activator is enabled.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
 		"Enable leader election for controller manager. "+
 			"Enabling this will ensure there is only one active controller manager.")
@@ -120,7 +125,7 @@ func main() {
 	// Cert won't be ready until manager starts, so start a goroutine here which
 	// will block until the cert is ready before setting up the controllers.
 	// Controllers who register after manager starts will start directly.
-	go setupControllers(mgr, certsReady)
+	go setupControllers(mgr, certsReady, enableServiceActivator, podIP)
 
 	//+kubebuilder:scaffold:builder
 
@@ -140,7 +145,7 @@ func main() {
 	}
 }
 
-func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
+func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServiceActivator bool, podIP string) {
 	// The controllers won't work until the webhooks are operating,
 	// and the webhook won't work until the certs are all in place.
 	setupLog.Info("waiting for the cert generation to complete")
@@ -176,6 +181,20 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
 		os.Exit(1)
 	}
 
+	if enableServiceActivator {
+		dynamicClient, err := dynamic.NewForConfig(mgr.GetConfig())
+		if err != nil {
+			setupLog.Error(err, "unable to create dynamic client")
+			os.Exit(1)
+		}
+
+		activatorReconciler := inferencecontroller.NewActivatorReconciler(mgr, dynamicClient, podIP)
+		if err := activatorReconciler.SetupWithManager(mgr); err != nil {
+			setupLog.Error(err, "unable to create controller", "controller", "Activator")
+			os.Exit(1)
+		}
+	}
+
 	if os.Getenv("ENABLE_WEBHOOKS") != "false" {
 		if err := webhook.SetupOpenModelWebhook(mgr); err != nil {
 			setupLog.Error(err, "unable to create webhook", "webhook", "Model")

diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml
@@ -388,6 +388,12 @@ spec:
                         - port
                         type: object
                     type: object
+                  stopSignal:
+                    description: |-
+                      StopSignal defines which signal will be sent to a container when it is being stopped.
+                      If not specified, the default is defined by the container runtime in use.
+                      StopSignal can only be set for Pods with a non-empty .spec.os.name
+                    type: string
                 type: object
               livenessProbe:
                 description: |-
@@ -770,7 +776,9 @@ spec:
                                     policies:
                                       description: |-
                                         policies is a list of potential scaling policies which can be used during scaling.
-                                        At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                        If not set, use the default values:
+                                        - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                        - For scale down: allow all pods to be removed in a 15s window.
                                       items:
                                         description: HPAScalingPolicy is a single
                                           policy which must hold true for a specified
@@ -814,6 +822,24 @@ spec:
                                         - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                       format: int32
                                       type: integer
+                                    tolerance:
+                                      anyOf:
+                                      - type: integer
+                                      - type: string
+                                      description: |-
+                                        tolerance is the tolerance on the ratio between the current and desired
+                                        metric value under which no updates are made to the desired number of
+                                        replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                        set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                        For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                        and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                        triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                        This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                        feature gate.
+                                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                      x-kubernetes-int-or-string: true
                                   type: object
                                 scaleUp:
                                   description: |-
@@ -826,7 +852,9 @@ spec:
                                     policies:
                                       description: |-
                                         policies is a list of potential scaling policies which can be used during scaling.
-                                        At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                        If not set, use the default values:
+                                        - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                        - For scale down: allow all pods to be removed in a 15s window.
                                       items:
                                         description: HPAScalingPolicy is a single
                                           policy which must hold true for a specified
@@ -870,6 +898,24 @@ spec:
                                         - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                       format: int32
                                       type: integer
+                                    tolerance:
+                                      anyOf:
+                                      - type: integer
+                                      - type: string
+                                      description: |-
+                                        tolerance is the tolerance on the ratio between the current and desired
+                                        metric value under which no updates are made to the desired number of
+                                        replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                        set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                        For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                        and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                        triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                        This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                        feature gate.
+                                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                      x-kubernetes-int-or-string: true
                                   type: object
                               type: object
                             metrics:

diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -295,7 +295,9 @@ spec:
                                   policies:
                                     description: |-
                                       policies is a list of potential scaling policies which can be used during scaling.
-                                      At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                      If not set, use the default values:
+                                      - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                      - For scale down: allow all pods to be removed in a 15s window.
                                     items:
                                       description: HPAScalingPolicy is a single policy
                                         which must hold true for a specified past
@@ -339,6 +341,24 @@ spec:
                                       - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                     format: int32
                                     type: integer
+                                  tolerance:
+                                    anyOf:
+                                    - type: integer
+                                    - type: string
+                                    description: |-
+                                      tolerance is the tolerance on the ratio between the current and desired
+                                      metric value under which no updates are made to the desired number of
+                                      replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                      set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                      For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                      and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                      triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                      This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                      feature gate.
+                                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                    x-kubernetes-int-or-string: true
                                 type: object
                               scaleUp:
                                 description: |-
@@ -351,7 +371,9 @@ spec:
                                   policies:
                                     description: |-
                                       policies is a list of potential scaling policies which can be used during scaling.
-                                      At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                      If not set, use the default values:
+                                      - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                      - For scale down: allow all pods to be removed in a 15s window.
                                     items:
                                       description: HPAScalingPolicy is a single policy
                                         which must hold true for a specified past
@@ -395,6 +417,24 @@ spec:
                                       - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                     format: int32
                                     type: integer
+                                  tolerance:
+                                    anyOf:
+                                    - type: integer
+                                    - type: string
+                                    description: |-
+                                      tolerance is the tolerance on the ratio between the current and desired
+                                      metric value under which no updates are made to the desired number of
+                                      replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                      set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                      For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                      and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                      triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                      This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                      feature gate.
+                                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                    x-kubernetes-int-or-string: true
                                 type: object
                             type: object
                           metrics: