From c060c143f183578497dae5855ea0bf4a9aa62efd Mon Sep 17 00:00:00 2001
From: MichaelClifford
Date: Wed, 26 Oct 2022 10:03:38 -0400
Subject: [PATCH] draft appwrapper templating

---
 src/codeflare_sdk/cluster/cluster.py |  23 ++-
 src/codeflare_sdk/cluster/config.py  |  11 +-
 .../templates/base-template.yaml     | 156 ++++++++++++++++++
 3 files changed, 183 insertions(+), 7 deletions(-)
 create mode 100644 src/codeflare_sdk/templates/base-template.yaml

diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index e13a6bf7..dab7a62c 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -1,6 +1,7 @@
 from .config import ClusterConfiguration
 from .model import RayCluster, AppWrapper
 from ..utils import pretty_print
+from ..utils.generate_yaml import generate_appwrapper
 import openshift as oc
 from typing import List, Optional
 
@@ -8,16 +9,26 @@ class Cluster:
     def __init__(self, config: ClusterConfiguration):
         self.config = config
-
-    # creates a new cluser with the provided or default spec
+        self.app_wrapper_yaml = self.create_app_wrapper()
+
+    def create_app_wrapper(self):
+        cpu = self.config.max_cpus
+        memory = self.config.max_memory
+        gpu = self.config.gpu
+        workers = self.config.max_worker
+        template = self.config.template
+        return generate_appwrapper(cpu=cpu, memory=memory,
+                                   gpu=gpu, workers=workers,
+                                   template=template)
+
+    # creates a new cluster with the provided or default spec
     def up(self, namespace='default'):
         with oc.project(namespace):
-            oc.invoke("apply", ["-f",
-                "https://raw.githubusercontent.com/IBM/multi-cluster-app-dispatcher/quota-management/doc/usage/examples/kuberay/config/aw-raycluster.yaml"])
+            oc.invoke("apply", ["-f", self.app_wrapper_yaml])
 
-    def down(self, name, namespace='default'):
+    def down(self, namespace='default'):
         with oc.project(namespace):
-            oc.invoke("delete",["AppWrapper", self.config.name])
+            oc.invoke("delete", ["-f", self.app_wrapper_yaml])
 
     def status(self, print_to_console=True):
         cluster = _ray_cluster_status(self.config.name)
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index 657c5ec8..4861e6ed 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -3,7 +3,16 @@
 @dataclass
 class ClusterConfiguration:
     name: str
+    head_info: list = field(default_factory=list)
+    machine_types: list = field(default_factory=list)
     min_cpus: int = 1
     max_cpus: int = 1
-    min_worker: int = 0
+    min_worker: int = 1
     max_worker: int = 1
+    min_memory: int = 2
+    max_memory: int = 2
+    gpu: int = 0
+    template: str = "src/codeflare_sdk/templates/base-template.yaml"
+    instascale: bool = False
+    envs: dict = field(default_factory=dict)
+    image: str = "rayproject/ray:latest"
diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml
new file mode 100644
index 00000000..bbef20d0
--- /dev/null
+++ b/src/codeflare_sdk/templates/base-template.yaml
@@ -0,0 +1,156 @@
+apiVersion: mcad.ibm.com/v1beta1
+kind: AppWrapper
+metadata:
+  name: raycluster-autoscaler
+  namespace: default
+spec:
+  resources:
+    Items: []
+    GenericItems:
+    - replicas: 1
+      custompodresources:
+      - replicas: 2
+        requests:
+          cpu: 10
+          memory: 512Mi
+        limits:
+          cpu: 10
+          memory: 1G
+      generictemplate:
+        # This config demonstrates KubeRay's Ray autoscaler integration.
+        # The resource requests and limits in this config are too small for production!
+        # For an example with more realistic resource configuration, see
+        # ray-cluster.autoscaler.large.yaml.
+        apiVersion: ray.io/v1alpha1
+        kind: RayCluster
+        metadata:
+          labels:
+            controller-tools.k8s.io: "1.0"
+          # A unique identifier for the head node and workers of this cluster.
+          name: raycluster-autoscaler
+        spec:
+          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
+          rayVersion: '2.0.0'
+          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
+          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
+          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
+          enableInTreeAutoscaling: true
+          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
+          # The example configuration shown below represents the DEFAULT values.
+          # (You may delete autoscalerOptions if the defaults are suitable.)
+          autoscalerOptions:
+            # upscalingMode is "Conservative", "Default", or "Aggressive."
+            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
+            # Default: Upscaling is not rate-limited.
+            # Aggressive: An alias for Default; upscaling is not rate-limited.
+            upscalingMode: Default
+            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
+            idleTimeoutSeconds: 60
+            # image optionally overrides the autoscaler's container image.
+            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
+            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
+            ## image: "my-repo/my-custom-autoscaler-image:tag"
+            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
+            imagePullPolicy: Always
+            # resources specifies optional resource request and limit overrides for the autoscaler container.
+            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
+            resources:
+              limits:
+                cpu: "500m"
+                memory: "512Mi"
+              requests:
+                cpu: "500m"
+                memory: "512Mi"
+          ######################headGroupSpec#################################
+          # head group template and specs (perhaps 'group' is not needed in the name)
+          headGroupSpec:
+            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
+            serviceType: ClusterIP
+            # logical group name, for this called head-group, also can be functional
+            # pod type head or worker
+            # rayNodeType: head # Not needed since it is under the headgroup
+            # the following params are used to complete the ray start: ray start --head --block ...
+            rayStartParams:
+              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
+              dashboard-host: '0.0.0.0'
+              block: 'true'
+              # num-cpus: '1' # can be auto-completed from the limits
+              # Use `resources` to optionally specify custom resource annotations for the Ray node.
+              # The value of `resources` is a string-integer mapping.
+              # Currently, `resources` must be provided in the specific format demonstrated below:
+              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
+            #pod template
+            template:
+              spec:
+                containers:
+                # The Ray head pod
+                - name: ray-head
+                  image: rayproject/ray:2.0.0
+                  imagePullPolicy: Always
+                  ports:
+                  - containerPort: 6379
+                    name: gcs
+                  - containerPort: 8265
+                    name: dashboard
+                  - containerPort: 10001
+                    name: client
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command: ["/bin/sh","-c","ray stop"]
+                  resources:
+                    limits:
+                      cpu: "1"
+                      memory: "1G"
+                    requests:
+                      cpu: "500m"
+                      memory: "512Mi"
+          workerGroupSpecs:
+          # the pod replicas in this group typed worker
+          - replicas: 1
+            minReplicas: 1
+            maxReplicas: 300
+            # logical group name, for this called small-group, also can be functional
+            groupName: small-group
+            # if worker pods need to be added, we can simply increment the replicas
+            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
+            # the operator will remove pods from the list until the number of replicas is satisfied
+            # when a pod is confirmed to be deleted, its name will be removed from the list below
+            #scaleStrategy:
+            #  workersToDelete:
+            #  - raycluster-complete-worker-small-group-bdtwh
+            #  - raycluster-complete-worker-small-group-hv457
+            #  - raycluster-complete-worker-small-group-k8tj7
+            # the following params are used to complete the ray start: ray start --block ...
+            rayStartParams:
+              block: 'true'
+            #pod template
+            template:
+              metadata:
+                labels:
+                  key: value
+                # annotations for pod
+                annotations:
+                  key: value
+              spec:
+                initContainers:
+                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
+                - name: init-myservice
+                  image: busybox:1.28
+                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
+                containers:
+                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
+                  image: rayproject/ray:2.0.0
+                  # environment variables to set in the container. Optional.
+                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command: ["/bin/sh","-c","ray stop"]
+                  resources:
+                    limits:
+                      cpu: "1"
+                      memory: "512Mi"
+                    requests:
+                      cpu: "500m"
+                      memory: "256Mi"
\ No newline at end of file
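
Notes on this draft: the patch imports `generate_appwrapper` from `..utils.generate_yaml`, but `src/codeflare_sdk/utils/generate_yaml.py` itself is not part of this diff. A minimal sketch of what that helper might look like, inferred only from the call site in `create_app_wrapper()` and from `up()` passing its return value to `oc apply -f` — the output filename and the exact template fields patched are assumptions, not anything this commit specifies:

```python
# Hypothetical sketch of src/codeflare_sdk/utils/generate_yaml.py.
# Only the keyword signature is taken from the diff; the rest is assumed.
import yaml  # PyYAML


def generate_appwrapper(cpu, memory, gpu, workers, template):
    # Load the base AppWrapper template shipped with the SDK.
    with open(template, "r") as f:
        app_wrapper = yaml.safe_load(f)

    # The RayCluster spec sits under the wrapper's first generic item.
    ray_cluster = app_wrapper["spec"]["resources"]["GenericItems"][0]["generictemplate"]

    # Scale the worker group and its container resources from the config values.
    worker_group = ray_cluster["spec"]["workerGroupSpecs"][0]
    worker_group["replicas"] = workers
    worker_group["maxReplicas"] = workers
    container = worker_group["template"]["spec"]["containers"][0]
    container["resources"]["limits"] = {
        "cpu": cpu,
        "memory": f"{memory}G",
        "nvidia.com/gpu": gpu,
    }

    # Write the rendered AppWrapper to disk and return its path, so that
    # Cluster.up()/down() can hand it to `oc apply/delete -f`.
    outfile = "appwrapper.yaml"
    with open(outfile, "w") as f:
        yaml.safe_dump(app_wrapper, f, default_flow_style=False)
    return outfile
```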
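
With that helper in place, the intended end-to-end flow of the new API would look roughly like this (the field values and namespace are illustrative only):

```python
from codeflare_sdk.cluster.cluster import Cluster
from codeflare_sdk.cluster.config import ClusterConfiguration

# The AppWrapper YAML is rendered once in Cluster.__init__ via
# create_app_wrapper(); up()/down() then apply or delete that file.
config = ClusterConfiguration(name="demo", max_cpus=2, max_memory=4,
                              max_worker=2, gpu=0)
cluster = Cluster(config)

cluster.up(namespace="default")    # oc apply -f <generated yaml>
cluster.status()                   # looks up the RayCluster by config.name
cluster.down(namespace="default")  # oc delete -f <generated yaml>
```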