
Commit 6b295a4

Update mcad kuberay example
1 parent f05c6a8 commit 6b295a4

File tree

2 files changed (+125 −99 lines)
doc/usage/examples/kuberay/config/aw-raycluster.yaml

Lines changed: 76 additions & 92 deletions
@@ -1,92 +1,45 @@
 apiVersion: mcad.ibm.com/v1beta1
 kind: AppWrapper
 metadata:
-  name: raycluster-autoscaler
+  name: raycluster-complete
   namespace: default
 spec:
   resources:
-    Items: []
     GenericItems:
-    - replicas: 1
-      custompodresources:
-      - replicas: 2
-        requests:
-          cpu: 10
-          memory: 512Mi
-        limits:
-          cpu: 10
-          memory: 1G
-      generictemplate:
-        # This config demonstrates KubeRay's Ray autoscaler integration.
+    - generictemplate:
         # The resource requests and limits in this config are too small for production!
-        # For an example with more realistic resource configuration, see
+        # For examples with more realistic resource configuration, see
+        # ray-cluster.complete.large.yaml and
         # ray-cluster.autoscaler.large.yaml.
         apiVersion: ray.io/v1alpha1
         kind: RayCluster
         metadata:
           labels:
             controller-tools.k8s.io: "1.0"
           # A unique identifier for the head node and workers of this cluster.
-          name: raycluster-autoscaler
+          name: raycluster-complete
         spec:
-          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
-          rayVersion: '2.0.0'
-          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
-          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
-          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
-          enableInTreeAutoscaling: true
-          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
-          # The example configuration shown below below represents the DEFAULT values.
-          # (You may delete autoscalerOptions if the defaults are suitable.)
-          autoscalerOptions:
-            # upscalingMode is "Default" or "Aggressive."
-            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
-            # Default: Upscaling is not rate-limited.
-            # Aggressive: An alias for Default; upscaling is not rate-limited.
-            upscalingMode: Default
-            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
-            idleTimeoutSeconds: 60
-            # image optionally overrides the autoscaler's container image.
-            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
-            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
-            ## image: "my-repo/my-custom-autoscaler-image:tag"
-            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
-            imagePullPolicy: Always
-            # resources specifies optional resource request and limit overrides for the autoscaler container.
-            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
-            resources:
-              limits:
-                cpu: "500m"
-                memory: "512Mi"
-              requests:
-                cpu: "500m"
-                memory: "512Mi"
-          ######################headGroupSpec#################################
-          # head group template and specs, (perhaps 'group' is not needed in the name)
+          rayVersion: '2.5.0'
+          # Ray head pod configuration
           headGroupSpec:
-            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
+            # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP.
+            # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types.
             serviceType: ClusterIP
-            # logical group name, for this called head-group, also can be functional
-            # pod type head or worker
-            # rayNodeType: head # Not needed since it is under the headgroup
-            # the following params are used to complete the ray start: ray start --head --block ...
+            # The `rayStartParams` are used to configure the `ray start` command.
+            # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
+            # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
             rayStartParams:
-              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
               dashboard-host: '0.0.0.0'
-              block: 'true'
-              # num-cpus: '1' # can be auto-completed from the limits
-              # Use `resources` to optionally specify custom resource annotations for the Ray node.
-              # The value of `resources` is a string-integer mapping.
-              # Currently, `resources` must be provided in the specific format demonstrated below:
-              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
-            #pod template
+            # pod template
             template:
+              metadata:
+                # Custom labels. NOTE: To avoid conflicts with the KubeRay operator, do not define custom labels that start with `raycluster`.
+                # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+                labels: {}
               spec:
                 containers:
-                # The Ray head pod
                 - name: ray-head
-                  image: rayproject/ray:2.0.0
-                  imagePullPolicy: Always
+                  image: rayproject/ray:2.5.0
                   ports:
                   - containerPort: 6379
                     name: gcs
@@ -98,59 +51,90 @@ spec:
                     preStop:
                       exec:
                         command: ["/bin/sh","-c","ray stop"]
+                  volumeMounts:
+                  - mountPath: /tmp/ray
+                    name: ray-logs
+                  # The resource requests and limits in this config are too small for production!
+                  # For an example with more realistic resource configuration, see
+                  # ray-cluster.autoscaler.large.yaml.
+                  # It is better to use a few large Ray pods than many small ones.
+                  # For production, it is ideal to size each Ray pod to take up the
+                  # entire Kubernetes node on which it is scheduled.
                   resources:
                     limits:
                       cpu: "1"
-                      memory: "1G"
+                      memory: "2G"
                     requests:
+                      # For production use-cases, we recommend specifying integer CPU requests and limits.
+                      # We also recommend setting requests equal to limits for both CPU and memory.
+                      # For this example, we use a 500m CPU request to accommodate resource-constrained local
+                      # Kubernetes testing environments such as KinD and minikube.
                       cpu: "500m"
-                      memory: "512Mi"
+                      memory: "2G"
+                volumes:
+                - name: ray-logs
+                  emptyDir: {}
           workerGroupSpecs:
           # the pod replicas in this group typed worker
           - replicas: 1
            minReplicas: 1
-            maxReplicas: 300
+            maxReplicas: 10
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
-            # if worker pods need to be added, we can simply increment the replicas
-            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
-            # the operator will remove pods from the list until the number of replicas is satisfied
-            # when a pod is confirmed to be deleted, its name will be removed from the list below
+            # If worker pods need to be added, we can increment the replicas.
+            # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list.
+            # The operator will remove pods from the list until the desired number of replicas is satisfied.
+            # If the difference between the current replica count and the desired replicas is greater than the
+            # number of entries in workersToDelete, random worker pods will be deleted.
            #scaleStrategy:
            #  workersToDelete:
            #  - raycluster-complete-worker-small-group-bdtwh
            #  - raycluster-complete-worker-small-group-hv457
            #  - raycluster-complete-worker-small-group-k8tj7
-            # the following params are used to complete the ray start: ray start --block ...
-            rayStartParams:
-              block: 'true'
+            # The `rayStartParams` are used to configure the `ray start` command.
+            # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
+            # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
+            rayStartParams: {}
            #pod template
            template:
-              metadata:
-                labels:
-                  key: value
-                # annotations for pod
-                annotations:
-                  key: value
              spec:
-                initContainers:
-                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
-                - name: init-myservice
-                  image: busybox:1.28
-                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
-                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
-                  image: rayproject/ray:2.0.0
-                  # environment variables to set in the container.Optional.
-                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
+                - name: ray-worker
+                  image: rayproject/ray:2.5.0
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
+                  # use volumeMounts. Optional.
+                  # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
+                  volumeMounts:
+                  - mountPath: /tmp/ray
+                    name: ray-logs
+                  # The resource requests and limits in this config are too small for production!
+                  # For an example with more realistic resource configuration, see
+                  # ray-cluster.autoscaler.large.yaml.
+                  # It is better to use a few large Ray pods than many small ones.
+                  # For production, it is ideal to size each Ray pod to take up the
+                  # entire Kubernetes node on which it is scheduled.
                  resources:
                    limits:
                      cpu: "1"
-                      memory: "512Mi"
+                      memory: "1G"
+                      # For production use-cases, we recommend specifying integer CPU requests and limits.
+                      # We also recommend setting requests equal to limits for both CPU and memory.
+                      # For this example, we use a 500m CPU request to accommodate resource-constrained local
+                      # Kubernetes testing environments such as KinD and minikube.
                    requests:
+                      # For production use-cases, we recommend specifying integer CPU requests and limits.
+                      # We also recommend setting requests equal to limits for both CPU and memory.
+                      # For this example, we use a 500m CPU request to accommodate resource-constrained local
+                      # Kubernetes testing environments such as KinD and minikube.
                      cpu: "500m"
-                      memory: "256Mi"
+                      # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container.
+                      memory: "1G"
+                # use volumes
+                # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
+                volumes:
+                - name: ray-logs
+                  emptyDir: {}
+

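The `rayStartParams` above are what the KubeRay operator folds into the `ray start` command of each Ray container. As a rough sketch of what the head pod in this example ends up running, plus a client-side check of the manifest before it is queued (the `--head`/`--block` flags and the manifest path are assumptions based on this example, not part of the commit):

```bash
# Approximate head-pod startup command assembled by KubeRay from rayStartParams
# (illustrative only; the operator composes the real invocation).
ray start --head --dashboard-host=0.0.0.0 --block

# Validate the AppWrapper manifest locally before submitting it to MCAD.
kubectl apply --dry-run=client -f doc/usage/examples/kuberay/config/aw-raycluster.yaml
```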
doc/usage/examples/kuberay/kuberay-mcad.md

Lines changed: 49 additions & 7 deletions
@@ -4,13 +4,55 @@ This integration will help in queuing on [kuberay](https://github.com/ray-projec
 
 #### Prerequisites
 
-- kubernetes or Openshift cluster
-- Install MCAD using instructions present under `deployment` directory
-- Make sure MCAD has clusterrole to create ray resources, please patch using configuration file present in `config` directory with name `xqueuejob-controller.yaml`
+- Kubernetes (see [KinD](https://kind.sigs.k8s.io/)) or OpenShift cluster (see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview))
+- Kubernetes client tools such as [kubectl](https://kubernetes.io/docs/tasks/tools/) or the [OpenShift CLI](https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html)
+- [Helm](https://helm.sh/docs/intro/install/)
+- Install the MCAD and KubeRay operators:
+  - KinD cluster:
+
+    Install the stable release of the MCAD operator from the local charts:
+    ```bash
+    git clone https://github.com/project-codeflare/multi-cluster-app-dispatcher
+    cd multi-cluster-app-dispatcher
+    helm install mcad --set image.repository=quay.io/project-codeflare/mcad-controller --set image.tag=stable deployment/mcad-controller
+    ```
+
+    Make sure MCAD has the cluster role to create Ray resources; patch it using [xqueuejob-controller.yaml](doc/usage/examples/kuberay/config/xqueuejob-controller.yaml). For example:
+    ```
+    kubectl apply -f doc/usage/examples/kuberay/config/xqueuejob-controller.yaml
+    ```
+
+    See [deployment.md](../../../../doc/deploy/deployment.md) for more options.
+
+    Install the KubeRay operator using the [instructions](https://github.com/ray-project/kuberay#quick-start). For example, install KubeRay v0.6.0 from the remote Helm repo:
+    ```
+    helm repo add kuberay https://ray-project.github.io/kuberay-helm/
+    helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0
+    ```
+
+  - OpenShift cluster:
+
+    The MCAD and KubeRay operators are part of the CodeFlare stack, which provides a simple, user-friendly abstraction for scaling,
+    queuing and resource management of distributed AI/ML and Python workloads. Please follow the `Distributed Workloads` [Quick-Start](https://github.com/opendatahub-io/distributed-workloads/blob/main/Quick-Start.md) for installation.
 
 #### Steps
 
-- Install kuberay operator from [link](https://docs.ray.io/en/latest/cluster/kubernetes/getting-started.html#deploying-the-kuberay-operator)
-- Submit ray cluster to MCAD as appwrapper using the config file `aw-raycluster.yaml` present in the `config` directory using command `kubectl create -f aw-raycluster.yaml`
-- Check the status of the appwrapper using command `kubectl describe appwrapper <your-appwrapper-name>`
-- Check running pods using command `kubectl get pods -n <your-name-space>`
+
+- Submit the RayCluster custom resource to MCAD as an AppWrapper using the [aw-raycluster.yaml](doc/usage/examples/kuberay/config/aw-raycluster.yaml) example:
+  ```bash
+  kubectl create -f doc/usage/examples/kuberay/config/aw-raycluster.yaml
+  ```
+- Check the status of the AppWrapper custom resource using the command
+  ```bash
+  kubectl describe appwrapper raycluster-complete -n default
+  ```
+- Check that the RayCluster status is ready using the command
+  ```bash
+  kubectl get raycluster -n default
+  ```
+  Expect:
+  ```
+  NAME                  DESIRED WORKERS   AVAILABLE WORKERS   STATUS   AGE
+  raycluster-complete   1                 1                   ready    6m45s
+  ```
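Once the RayCluster reports `ready`, the head and worker pods can also be listed directly. A minimal sketch, assuming the standard `ray.io/cluster` and `ray.io/node-type` labels that the KubeRay operator attaches to the pods it creates (these labels are not shown in this commit):

```bash
# Expect one head pod and one worker pod in Running state.
kubectl get pods -n default -l ray.io/cluster=raycluster-complete

# Show only the head pod.
kubectl get pods -n default -l ray.io/cluster=raycluster-complete,ray.io/node-type=head
```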

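To tear the example down, delete the AppWrapper; MCAD should then clean up the wrapped RayCluster and its pods. A short sketch using the names from this example:

```bash
# Remove the queued workload and the resources created from its generic template.
kubectl delete appwrapper raycluster-complete -n default

# Optionally confirm that nothing is left behind.
kubectl get raycluster,pods -n default
```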