diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go index 7f4a997f..982959da 100644 --- a/api/core/v1alpha1/model_types.go +++ b/api/core/v1alpha1/model_types.go @@ -35,6 +35,11 @@ const ( // Once either of them qualified, we'll expose this as a field in Model. ModelPreheatAnnoKey = "llmaz.io/model-preheat" + // ModelActivatorAnnoKey is used to indicate the model name activated by the activator. + ModelActivatorAnnoKey = "activator.llmaz.io/model-name" + // CachedModelActivatorAnnoKey is used to cache the activator state of the model. + CachedModelActivatorAnnoKey = "activator.llmaz.io/cached-state" + HUGGING_FACE = "Huggingface" MODEL_SCOPE = "ModelScope" diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml index 0be4fea7..aa58f15b 100644 --- a/chart/templates/deployment.yaml +++ b/chart/templates/deployment.yaml @@ -29,6 +29,10 @@ spec: env: - name: KUBERNETES_CLUSTER_DOMAIN value: {{ quote .Values.kubernetesClusterDomain }} + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }} livenessProbe: diff --git a/chart/values.yaml b/chart/values.yaml index ddf55284..60f76554 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -5,6 +5,8 @@ controllerManager: - --metrics-bind-address=:8443 - --leader-elect - --namespace=llmaz-system + - --enable-service-activator + - --pod-ip=$(POD_IP) containerSecurityContext: allowPrivilegeEscalation: false capabilities: diff --git a/cmd/main.go b/cmd/main.go index 5372335f..17c86008 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/dynamic" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/healthz" @@ -63,10 +64,14 @@ func 
main() { var enableLeaderElection bool var probeAddr string var namespace string + var enableServiceActivator bool + var podIP string flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.StringVar(&namespace, "namespace", "llmaz-system", "The namespace of the llmaz to deploy") + flag.BoolVar(&enableServiceActivator, "enable-service-activator", false, "Enable the service activator feature. This is an experimental feature.") + flag.StringVar(&podIP, "pod-ip", "", "The pod IP of the llmaz controller manager. Only used when service activator is enabled.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") @@ -120,7 +125,7 @@ func main() { // Cert won't be ready until manager starts, so start a goroutine here which // will block until the cert is ready before setting up the controllers. // Controllers who register after manager starts will start directly. - go setupControllers(mgr, certsReady) + go setupControllers(mgr, certsReady, enableServiceActivator, podIP) //+kubebuilder:scaffold:builder @@ -140,7 +145,7 @@ func main() { } } -func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) { +func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServiceActivator bool, podIP string) { // The controllers won't work until the webhooks are operating, // and the webhook won't work until the certs are all in places. 
setupLog.Info("waiting for the cert generation to complete") @@ -176,6 +181,20 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) { os.Exit(1) } + if enableServiceActivator { + dynamicClient, err := dynamic.NewForConfig(mgr.GetConfig()) + if err != nil { + setupLog.Error(err, "unable to create dynamic client") + os.Exit(1) + } + + activatorReconciler := inferencecontroller.NewActivatorReconciler(mgr, dynamicClient, podIP) + if err := activatorReconciler.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Activator") + os.Exit(1) + } + } + if os.Getenv("ENABLE_WEBHOOKS") != "false" { if err := webhook.SetupOpenModelWebhook(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Model") diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml index a626c0af..ea03fd13 100644 --- a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml +++ b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml @@ -388,6 +388,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. + StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: |- @@ -770,7 +776,9 @@ spec: policies: description: |- policies is a list of potential scaling polices which can be used during scaling. - At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. 
items: description: HPAScalingPolicy is a single policy which must hold true for a specified @@ -814,6 +822,24 @@ spec: - For scale down: 300 (i.e. the stabilization window is 300 seconds long). format: int32 type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object scaleUp: description: |- @@ -826,7 +852,9 @@ spec: policies: description: |- policies is a list of potential scaling polices which can be used during scaling. - At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. items: description: HPAScalingPolicy is a single policy which must hold true for a specified @@ -870,6 +898,24 @@ spec: - For scale down: 300 (i.e. the stabilization window is 300 seconds long). 
format: int32 type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object type: object metrics: diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml index d64ae9d9..44b38958 100644 --- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml +++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml @@ -295,7 +295,9 @@ spec: policies: description: |- policies is a list of potential scaling polices which can be used during scaling. - At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. items: description: HPAScalingPolicy is a single policy which must hold true for a specified past @@ -339,6 +341,24 @@ spec: - For scale down: 300 (i.e. the stabilization window is 300 seconds long). 
format: int32 type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object scaleUp: description: |- @@ -351,7 +371,9 @@ spec: policies: description: |- policies is a list of potential scaling polices which can be used during scaling. - At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. items: description: HPAScalingPolicy is a single policy which must hold true for a specified past @@ -395,6 +417,24 @@ spec: - For scale down: 300 (i.e. the stabilization window is 300 seconds long). format: int32 type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). 
+ + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object type: object metrics: diff --git a/config/crd/bases/inference.llmaz.io_services.yaml b/config/crd/bases/inference.llmaz.io_services.yaml index 61fb763a..13926b0b 100644 --- a/config/crd/bases/inference.llmaz.io_services.yaml +++ b/config/crd/bases/inference.llmaz.io_services.yaml @@ -498,7 +498,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -513,7 +512,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -682,7 +680,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -697,7 +694,6 @@ spec: pod labels will be ignored. The default value is empty. 
The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -864,7 +860,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -879,7 +874,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -1048,7 +1042,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -1063,7 +1056,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -1327,7 +1319,7 @@ spec: Cannot be updated. 
items: description: EnvFromSource represents the source - of a set of ConfigMaps + of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from @@ -1348,9 +1340,9 @@ spec: type: object x-kubernetes-map-type: atomic prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be a - C_IDENTIFIER. + description: Optional text to prepend to the + name of each environment variable. Must + be a C_IDENTIFIER. type: string secretRef: description: The Secret to select from @@ -1622,6 +1614,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. + StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: |- @@ -2853,7 +2851,7 @@ spec: Cannot be updated. items: description: EnvFromSource represents the source - of a set of ConfigMaps + of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from @@ -2874,9 +2872,9 @@ spec: type: object x-kubernetes-map-type: atomic prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be a - C_IDENTIFIER. + description: Optional text to prepend to the + name of each environment variable. Must + be a C_IDENTIFIER. type: string secretRef: description: The Secret to select from @@ -3145,6 +3143,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. 
+ StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: Probes are not allowed for ephemeral @@ -4200,7 +4204,7 @@ spec: Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of - of that value or the sum of the normal containers. Limits are applied to init containers + that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. @@ -4376,7 +4380,7 @@ spec: Cannot be updated. items: description: EnvFromSource represents the source - of a set of ConfigMaps + of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from @@ -4397,9 +4401,9 @@ spec: type: object x-kubernetes-map-type: atomic prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be a - C_IDENTIFIER. + description: Optional text to prepend to the + name of each environment variable. Must + be a C_IDENTIFIER. type: string secretRef: description: The Secret to select from @@ -4671,6 +4675,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. + StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: |- @@ -6391,7 +6401,6 @@ spec: - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. If this value is nil, the behavior is equivalent to the Honor policy. 
- This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. type: string nodeTaintsPolicy: description: |- @@ -6402,7 +6411,6 @@ spec: - Ignore: node taints are ignored. All nodes are included. If this value is nil, the behavior is equivalent to the Ignore policy. - This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. type: string topologyKey: description: |- @@ -7404,7 +7412,7 @@ spec: The types of objects that may be mounted by this volume are defined by the container runtime implementation on a host machine and at minimum must include all valid types supported by the container image field. The OCI object gets mounted in a single directory (spec.containers[*].volumeMounts.mountPath) by merging the manifest layers in the same way as for container images. The volume will be mounted read-only (ro) and non-executable files (noexec). - Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath). + Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath) before 1.33. The field spec.securityContext.fsGroupChangePolicy has no effect on this volume type. properties: pullPolicy: @@ -8664,7 +8672,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -8679,7 +8686,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). 
items: type: string type: array @@ -8848,7 +8854,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -8863,7 +8868,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -9030,7 +9034,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -9045,7 +9048,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -9214,7 +9216,6 @@ spec: pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -9229,7 +9230,6 @@ spec: pod labels will be ignored. The default value is empty. 
The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. - This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). items: type: string type: array @@ -9493,7 +9493,7 @@ spec: Cannot be updated. items: description: EnvFromSource represents the source - of a set of ConfigMaps + of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from @@ -9514,9 +9514,9 @@ spec: type: object x-kubernetes-map-type: atomic prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be a - C_IDENTIFIER. + description: Optional text to prepend to the + name of each environment variable. Must + be a C_IDENTIFIER. type: string secretRef: description: The Secret to select from @@ -9788,6 +9788,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. + StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: |- @@ -11019,7 +11025,7 @@ spec: Cannot be updated. items: description: EnvFromSource represents the source - of a set of ConfigMaps + of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from @@ -11040,9 +11046,9 @@ spec: type: object x-kubernetes-map-type: atomic prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be a - C_IDENTIFIER. + description: Optional text to prepend to the + name of each environment variable. Must + be a C_IDENTIFIER. 
type: string secretRef: description: The Secret to select from @@ -11311,6 +11317,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. + StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: Probes are not allowed for ephemeral @@ -12366,7 +12378,7 @@ spec: Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of - of that value or the sum of the normal containers. Limits are applied to init containers + that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. @@ -12542,7 +12554,7 @@ spec: Cannot be updated. items: description: EnvFromSource represents the source - of a set of ConfigMaps + of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from @@ -12563,9 +12575,9 @@ spec: type: object x-kubernetes-map-type: atomic prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be a - C_IDENTIFIER. + description: Optional text to prepend to the + name of each environment variable. Must + be a C_IDENTIFIER. type: string secretRef: description: The Secret to select from @@ -12837,6 +12849,12 @@ spec: - port type: object type: object + stopSignal: + description: |- + StopSignal defines which signal will be sent to a container when it is being stopped. + If not specified, the default is defined by the container runtime in use. 
+ StopSignal can only be set for Pods with a non-empty .spec.os.name + type: string type: object livenessProbe: description: |- @@ -14557,7 +14575,6 @@ spec: - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. If this value is nil, the behavior is equivalent to the Honor policy. - This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. type: string nodeTaintsPolicy: description: |- @@ -14568,7 +14585,6 @@ spec: - Ignore: node taints are ignored. All nodes are included. If this value is nil, the behavior is equivalent to the Ignore policy. - This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. type: string topologyKey: description: |- @@ -15570,7 +15586,7 @@ spec: The types of objects that may be mounted by this volume are defined by the container runtime implementation on a host machine and at minimum must include all valid types supported by the container image field. The OCI object gets mounted in a single directory (spec.containers[*].volumeMounts.mountPath) by merging the manifest layers in the same way as for container images. The volume will be mounted read-only (ro) and non-executable files (noexec). - Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath). + Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath) before 1.33. The field spec.securityContext.fsGroupChangePolicy has no effect on this volume type. 
properties: pullPolicy: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 61c42763..ea9b95cc 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -11,6 +11,16 @@ rules: verbs: - get - list +- apiGroups: + - "" + resources: + - endpoints + verbs: + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: diff --git a/pkg/controller/inference/activator_controller.go b/pkg/controller/inference/activator_controller.go new file mode 100644 index 00000000..78bcc212 --- /dev/null +++ b/pkg/controller/inference/activator_controller.go @@ -0,0 +1,564 @@ +/* +Copyright 2025 The InftyAI Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package inference + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net" + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + llmazcorev1alpha1 "github.com/inftyai/llmaz/api/core/v1alpha1" +) + +var ( + activatorControllerLog = ctrl.Log.WithName("activator-controller") +) + +const ( + playgroundsResource = "playgrounds" + activatorControllerName = "activator-controller" +) + +type ActivatorReconciler struct { + client.Client + dynamicClient dynamic.Interface + portManager *PortManager + ip string +} + +func NewActivatorReconciler(mgr ctrl.Manager, dynamicClient dynamic.Interface, ip string) *ActivatorReconciler { + reconciler := &ActivatorReconciler{ + Client: mgr.GetClient(), + dynamicClient: dynamicClient, + ip: ip, + } + reconciler.portManager = NewPortManager(reconciler.scaleUp) + return reconciler +} + +// +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;update;patch;delete +// +kubebuilder:rbac:groups="",resources=endpoints,verbs=get;list;watch;update;patch + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
+// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.16.3/pkg/reconcile +func (r *ActivatorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + svc := &corev1.Service{} + if err := r.Get(ctx, req.NamespacedName, svc); err != nil { + if errors.IsNotFound(err) { + r.handleServiceDeletion(req.Namespace, req.Name) + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + if err := r.restoreSelectorIfNeeded(ctx, svc); err != nil { + return ctrl.Result{}, err + } + + // nolint:staticcheck + ep := &corev1.Endpoints{} + if err := r.Get(ctx, req.NamespacedName, ep); err != nil { + if errors.IsNotFound(err) { + activatorControllerLog.Info("Endpoints not found, waiting for creation", "service", svc.Name) + return ctrl.Result{}, nil + } + activatorControllerLog.Error(err, "Failed to get endpoints", "service", svc.Name) + return ctrl.Result{}, err + } + + // Check if the service has the activator annotation + ports, ok := r.needInject(svc) + if !ok { + activatorControllerLog.Info("Activator annotation not found, skipping", "service", svc.Name) + return ctrl.Result{}, nil + } + + if len(ep.Subsets) == 0 { + // If the endpoints are empty, inject the activator IP + return ctrl.Result{}, r.injectEndpoint(ctx, ep, svc, ports) + } else if len(ep.Subsets[0].Addresses) > 0 && + ep.Subsets[0].Addresses[0].IP != r.ip { + // If the endpoints are not empty and not the activator IP, forward the traffic + return ctrl.Result{}, r.forwardEndpoint(ctx, ep, ports) + } + + return ctrl.Result{}, nil +} + +func (r *ActivatorReconciler) needInject(svc *corev1.Service) ([]corev1.ServicePort, bool) { + if svc == nil || svc.Annotations == nil { + return nil, false + } + if _, ok := svc.Annotations[llmazcorev1alpha1.ModelActivatorAnnoKey]; !ok { + return nil, false + } + if len(svc.Spec.Ports) == 0 || svc.Spec.Type != corev1.ServiceTypeClusterIP { + return nil, false + } + + validPorts := 
make([]corev1.ServicePort, 0, len(svc.Spec.Ports)) + for _, port := range svc.Spec.Ports { + if port.Port == 0 || port.Protocol != corev1.ProtocolTCP { + continue + } + validPorts = append(validPorts, port) + } + if len(validPorts) == 0 { + return nil, false + } + return validPorts, true +} + +func (r *ActivatorReconciler) restoreSelectorIfNeeded(ctx context.Context, svc *corev1.Service) error { + selectorStr := svc.Annotations[llmazcorev1alpha1.CachedModelActivatorAnnoKey] + if selectorStr == "" { + return nil + } + + sel := map[string]string{} + if err := json.Unmarshal([]byte(selectorStr), &sel); err != nil { + activatorControllerLog.Error(err, "Failed to unmarshal selector") + return err + } + + updatedSvc := svc.DeepCopy() + delete(updatedSvc.Annotations, llmazcorev1alpha1.CachedModelActivatorAnnoKey) + updatedSvc.Spec.Selector = sel + + if err := r.Update(ctx, updatedSvc); err != nil { + activatorControllerLog.Error(err, "Failed to restore service selector") + return err + } + + activatorControllerLog.Info("Restored service selector", "selector", sel) + return nil +} + +// nolint:staticcheck +func (r *ActivatorReconciler) injectEndpoint(ctx context.Context, ep *corev1.Endpoints, svc *corev1.Service, ports []corev1.ServicePort) error { + // nolint:staticcheck + subsets := make([]corev1.EndpointSubset, 0, len(ports)) + for _, port := range ports { + ds, err := r.portManager.AddTarget(ep.Name, ep.Namespace, int(port.Port)) + if err != nil { + return err + } + + activatorControllerLog.Info("Injecting endpoint", + "port", port.Port, + "listenerPort", ds.Listener.Port(), + ) + + // nolint:staticcheck + subsets = append(subsets, corev1.EndpointSubset{ + Addresses: []corev1.EndpointAddress{{IP: r.ip}}, + Ports: []corev1.EndpointPort{{ + Name: port.Name, + Port: int32(ds.Listener.Port()), + }}, + }) + } + + updatedEp := ep.DeepCopy() + updatedEp.Subsets = subsets + if err := r.Update(ctx, updatedEp); err != nil { + activatorControllerLog.Error(err, "Failed to update 
endpoints") + return err + } + + // Save the original selector to annotation and clear the selector + selectorBytes, _ := json.Marshal(svc.Spec.Selector) + updatedSvc := svc.DeepCopy() + if updatedSvc.Annotations == nil { + updatedSvc.Annotations = make(map[string]string) + } + updatedSvc.Annotations[llmazcorev1alpha1.CachedModelActivatorAnnoKey] = string(selectorBytes) + updatedSvc.Spec.Selector = nil + return r.Update(ctx, updatedSvc) +} + +func (r *ActivatorReconciler) handleServiceDeletion(namespace, name string) { + pis := r.portManager.RemoveTargetForAllPorts(name, namespace) + for _, pi := range pis { + activatorControllerLog.Info("Cleaning up endpoints after service deletion", + "port", pi.Target.Port, + "listenerPort", pi.Listener.Port(), + ) + _ = pi.Listener.Close() + for _, conn := range pi.Connections { + _ = conn.Close() + } + } +} + +// nolint:staticcheck +func (r *ActivatorReconciler) forwardEndpoint(_ context.Context, ep *corev1.Endpoints, ports []corev1.ServicePort) error { + for _, port := range ports { + ds := r.portManager.RemoveTarget(ep.Name, ep.Namespace, int(port.Port)) + if ds == nil { + continue + } + + address, err := r.getEndpointAddress(ep, ports, &ds.Target) + if err != nil { + activatorControllerLog.Error(err, "Failed to get endpoint address") + continue + } + + activatorControllerLog.Info("Forwarding traffic to real endpoint", + "port", port.Port, + "address", address, + "connections", len(ds.Connections), + ) + + for _, conn := range ds.Connections { + targetConn, err := net.Dial("tcp", address) + if err != nil { + activatorControllerLog.Error(err, "Failed to dial target") + continue + } + tunnel(conn, targetConn) + } + err = ds.Listener.Close() + if err != nil { + activatorControllerLog.Error(err, "Failed to close listener") + return err + } + } + return nil +} + +// nolint:staticcheck +func (r *ActivatorReconciler) getEndpointAddress(ep *corev1.Endpoints, ports []corev1.ServicePort, target *Target) (string, error) { + for _, port 
:= range ports { + if int(port.Port) != target.Port { + continue + } + + for _, subset := range ep.Subsets { + if len(subset.Addresses) == 0 { + continue + } + for _, p := range subset.Ports { + if port.TargetPort.Type == intstr.Int && int(p.Port) == int(port.TargetPort.IntVal) { + return fmt.Sprintf("%s:%d", subset.Addresses[0].IP, p.Port), nil + } + } + } + } + return "", fmt.Errorf("address not found for port %d", target.Port) +} + +func (r *ActivatorReconciler) scaleUp(pi *PortInformation) { + ctx := context.Background() + activatorControllerLog.Info("Scaling up target Playground", "service", pi.Target.Name) + + svc := &corev1.Service{} + key := types.NamespacedName{Namespace: pi.Target.Namespace, Name: pi.Target.Name} + if err := r.Get(ctx, key, svc); err != nil { + activatorControllerLog.Error(err, "Failed to get service") + return + } + + name := svc.Annotations[llmazcorev1alpha1.ModelActivatorAnnoKey] + if name == "" { + activatorControllerLog.Error(nil, "Scale annotation not found") + return + } + + gvr := llmazcorev1alpha1.GroupVersion.WithResource(playgroundsResource) + + activatorControllerLog.Info("Scaling up Playground", "playground", name) + retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error { + playground, err := r.dynamicClient.Resource(gvr).Namespace(pi.Target.Namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err + } + if err := unstructured.SetNestedField(playground.Object, int64(1), "spec", "replicas"); err != nil { + return err + } + _, err = r.dynamicClient.Resource(gvr).Namespace(pi.Target.Namespace).Update(ctx, playground, metav1.UpdateOptions{}) + return err + }) + + if retryErr != nil { + activatorControllerLog.Error(retryErr, "Failed to scale Playground") + return + } + + if err := r.waitUntilPlaygroundPodIsReady(ctx, name, pi.Target.Namespace); err != nil { + activatorControllerLog.Error(err, "Failed waiting for Playground pod") + return + } + + // Restore the service selector + 
restoreSelectorIfNeededErr := r.restoreSelectorIfNeeded(ctx, svc) + if restoreSelectorIfNeededErr != nil { + activatorControllerLog.Error(restoreSelectorIfNeededErr, "Failed to restore service selector") + return + } +} + +func (r *ActivatorReconciler) waitUntilPlaygroundPodIsReady(ctx context.Context, name, namespace string) error { + // The pod name is always playground name + "-0" + podName := name + "-0" + return wait.PollUntilContextTimeout(ctx, time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { + pod := &corev1.Pod{} + if err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: podName}, pod); err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, err + } + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { + return true, nil + } + } + return false, nil + }) +} + +func (r *ActivatorReconciler) SetupWithManager(mgr ctrl.Manager) error { + hasActivatorAnnotation := func(obj client.Object) bool { + // Make sure the object has the activator annotation + annotations := obj.GetAnnotations() + _, ok := annotations[llmazcorev1alpha1.ModelActivatorAnnoKey] + if ok { + activatorControllerLog.V(4).Info("Object has activator annotation", "object", obj.GetName()) + } + + return ok + } + + return ctrl.NewControllerManagedBy(mgr). + Named(activatorControllerName). + For(&corev1.Service{}, builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return hasActivatorAnnotation(e.Object) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return hasActivatorAnnotation(e.ObjectNew) || hasActivatorAnnotation(e.ObjectOld) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return hasActivatorAnnotation(e.Object) + }, + })). 
// Listener is a net.Listener that also reports the TCP port it is bound to.
type Listener interface {
	net.Listener
	Port() int
}

// listener is the default Listener: a TCP listener plus its bound port.
// Accept, Close and Addr are promoted from the embedded net.Listener.
type listener struct {
	net.Listener
	port int
}

var _ Listener = (*listener)(nil)

// NewListener opens a TCP listener on an ephemeral port (":0") and records
// the port that was assigned to it.
func NewListener() (Listener, error) {
	l, err := net.Listen("tcp", ":0")
	if err != nil {
		return nil, err
	}
	addr, ok := l.Addr().(*net.TCPAddr)
	if !ok {
		// Do not leak the socket if the address has an unexpected type.
		_ = l.Close()
		return nil, fmt.Errorf("unexpected listener address type %T", l.Addr())
	}
	return &listener{Listener: l, port: addr.Port}, nil
}

// Port returns the TCP port the listener is bound to.
func (l *listener) Port() int {
	return l.port
}

// Target identifies one port of a namespaced object that a listener stands
// in for.
type Target struct {
	Name      string
	Namespace string
	Port      int
}

// PortInformation ties a Target to the listener opened for it and to the
// connections accepted on that listener.
type PortInformation struct {
	Target   Target
	Listener Listener
	// Connections is appended to by the accept loop, under the PortManager
	// lock, for as long as the target is registered. Read it only after the
	// target has been removed from the PortManager.
	Connections []net.Conn
}

// PortManager keeps the registry of targets and their listeners.
type PortManager struct {
	// portMap indexes registered entries by listener port.
	portMap map[int]*PortInformation
	// reversePortMap maps a target to the listener port registered for it.
	reversePortMap map[Target]int
	// mut guards both maps and the Connections slice of registered entries.
	mut sync.Mutex

	// cb is started in its own goroutine when a registered target receives
	// its first connection. It may be nil.
	cb func(*PortInformation)
}

// NewPortManager returns an empty PortManager that reports the first
// connection of each target to cb.
func NewPortManager(cb func(*PortInformation)) *PortManager {
	return &PortManager{
		portMap:        map[int]*PortInformation{},
		reversePortMap: map[Target]int{},
		cb:             cb,
	}
}

// AddTarget registers name/namespace/port and returns its entry. If the
// target is already registered the existing entry is returned; otherwise a
// new listener is opened and an accept loop is started for it.
func (pm *PortManager) AddTarget(name string, namespace string, port int) (*PortInformation, error) {
	pm.mut.Lock()
	defer pm.mut.Unlock()

	target := Target{
		Name:      name,
		Namespace: namespace,
		Port:      port,
	}

	if listenPort, ok := pm.reversePortMap[target]; ok {
		return pm.portMap[listenPort], nil
	}

	l, err := NewListener()
	if err != nil {
		return nil, err
	}
	downstream := &PortInformation{
		Target:   target,
		Listener: l,
	}
	pm.portMap[l.Port()] = downstream
	pm.reversePortMap[target] = l.Port()

	go pm.startListener(downstream)
	return downstream, nil
}

// RemoveTarget unregisters name/namespace/port and returns its entry, or nil
// when the target is not registered. The listener is left open; closing it,
// and handling the collected connections, is up to the caller.
func (pm *PortManager) RemoveTarget(name string, namespace string, port int) *PortInformation {
	pm.mut.Lock()
	defer pm.mut.Unlock()

	target := Target{
		Name:      name,
		Namespace: namespace,
		Port:      port,
	}

	listenPort, ok := pm.reversePortMap[target]
	if !ok {
		return nil
	}
	downstream := pm.portMap[listenPort]
	delete(pm.portMap, listenPort)
	delete(pm.reversePortMap, target)
	return downstream
}

// RemoveTargetForAllPorts unregisters every target that belongs to
// name/namespace, whatever its port, and returns the removed entries.
// Listeners are left open for the caller to close.
func (pm *PortManager) RemoveTargetForAllPorts(name string, namespace string) []*PortInformation {
	pm.mut.Lock()
	defer pm.mut.Unlock()

	var downstreams []*PortInformation
	for listenPort, downstream := range pm.portMap {
		if downstream.Target.Name == name && downstream.Target.Namespace == namespace {
			delete(pm.portMap, listenPort)
			delete(pm.reversePortMap, downstream.Target)
			downstreams = append(downstreams, downstream)
		}
	}
	return downstreams
}

// startListener is the accept loop of one entry; it ends when Accept fails,
// which happens once the listener is closed.
//
// Accepted connections are recorded under the manager lock, and only while
// the entry is still registered. This orders every append before the removal
// that hands the entry to a caller, so the caller can read Connections
// without racing with this goroutine. A connection that arrives after the
// removal can no longer be forwarded by anyone and is closed immediately.
func (pm *PortManager) startListener(downstream *PortInformation) {
	listenPort := downstream.Listener.Port()
	started := false
	for {
		conn, err := downstream.Listener.Accept()
		if err != nil {
			return
		}

		pm.mut.Lock()
		// Compare the pointer, not just the key: the port number could have
		// been handed to a newer entry in the meantime.
		registered := pm.portMap[listenPort] == downstream
		if registered {
			downstream.Connections = append(downstream.Connections, conn)
		}
		pm.mut.Unlock()

		if !registered {
			_ = conn.Close()
			continue
		}
		if !started {
			started = true
			if pm.cb != nil {
				go pm.cb(downstream)
			}
		}
	}
}
@@ func (r *ServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } // Create a service for the leader pods of the lws for loadbalancing. - if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service); err != nil { + if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service, models); err != nil { return ctrl.Result{}, err } @@ -357,6 +357,12 @@ func modelAnnotations(service *inferenceapi.Service) map[string]string { return nil } +func activatorAnnotations(model *coreapi.OpenModel) map[string]string { + return map[string]string{ + coreapi.ModelActivatorAnnoKey: model.Name, + } +} + func setServiceCondition(service *inferenceapi.Service, workload *lws.LeaderWorkerSet) { defer func() { if service.Status.Selector != workload.Status.HPAPodSelector { @@ -419,7 +425,7 @@ func setControllerReferenceForWorkload(owner metav1.Object, lws *applyconfigurat return nil } -func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service) error { +func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service, model []*coreapi.OpenModel) error { log := ctrl.LoggerFrom(ctx) // The load balancing service name. svcName := service.Name + "-lb" @@ -433,6 +439,8 @@ func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Sche ObjectMeta: metav1.ObjectMeta{ Name: svcName, Namespace: service.Namespace, + // For activator service, we can ignore it if serverless config is not enabled. + Annotations: activatorAnnotations(model[0]), }, Spec: corev1.ServiceSpec{ Ports: []corev1.ServicePort{