rayVersion: '2.5.0'
# Ray head pod configuration
headGroupSpec:
  # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP.
  # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types.
  serviceType: ClusterIP
  # The `rayStartParams` are used to configure the `ray start` command.
  # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
  # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
  rayStartParams:
    dashboard-host: '0.0.0.0'
  # Pod template
  template:
    metadata:
      # Custom labels. NOTE: To avoid conflicts with the KubeRay operator, do not define custom labels that start with `raycluster`.
      # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
      labels: {}
    spec:
      containers:
        - name: ray-head
          image: rayproject/ray:2.5.0
          ports:
            - containerPort: 6379
              name: gcs
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
              name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          volumeMounts:
            - mountPath: /tmp/ray
              name: ray-logs
          # The resource requests and limits in this config are too small for production!
          # For an example with a more realistic resource configuration, see
          # ray-cluster.autoscaler.large.yaml.
          # It is better to use a few large Ray pods than many small ones.
          # For production, it is ideal to size each Ray pod to take up the
          # entire Kubernetes node on which it is scheduled.
          resources:
            limits:
              cpu: "1"
              memory: "2G"
            requests:
              # For production use-cases, we recommend specifying integer CPU requests and limits.
              # We also recommend setting requests equal to limits for both CPU and memory.
              # For this example, we use a 500m CPU request to accommodate resource-constrained local
              # Kubernetes testing environments such as KinD and minikube.
              cpu: "500m"
              memory: "2G"
      volumes:
        - name: ray-logs
          emptyDir: {}
workerGroupSpecs:
  # The number of pod replicas in this worker group
  - replicas: 1
    minReplicas: 1
    maxReplicas: 10
    # Logical group name; for this example it is called small-group, but it can also be functional.
    groupName: small-group
    # If worker pods need to be added, we can increment the replicas.
    # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list.
    # The operator will remove pods from the list until the desired number of replicas is satisfied.
    # If the difference between the current replica count and the desired replicas is greater than the
    # number of entries in workersToDelete, random worker pods will be deleted.
    #scaleStrategy:
    #  workersToDelete:
    #    - raycluster-complete-worker-small-group-bdtwh
    #    - raycluster-complete-worker-small-group-hv457
    #    - raycluster-complete-worker-small-group-k8tj7
    # The `rayStartParams` are used to configure the `ray start` command.
    # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
    # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
    rayStartParams: {}
    # Pod template
    template:
      spec:
        containers:
          - name: ray-worker
            image: rayproject/ray:2.5.0
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh","-c","ray stop"]
            # Optional: use volumeMounts.
            # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
            # The resource requests and limits in this config are too small for production!
            # For an example with a more realistic resource configuration, see
            # ray-cluster.autoscaler.large.yaml.
            # It is better to use a few large Ray pods than many small ones.
            # For production, it is ideal to size each Ray pod to take up the
            # entire Kubernetes node on which it is scheduled.
            resources:
              limits:
                cpu: "1"
                memory: "1G"
              requests:
                # For production use-cases, we recommend specifying integer CPU requests and limits.
                # We also recommend setting requests equal to limits for both CPU and memory.
                # For this example, we use a 500m CPU request to accommodate resource-constrained local
                # Kubernetes testing environments such as KinD and minikube.
                cpu: "500m"
                # For production use-cases, we recommend allocating at least 8GB of memory for each Ray container.
                memory: "1G"
        # Optional: use volumes.
        # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
        volumes:
          - name: ray-logs
            emptyDir: {}

`doc/usage/examples/kuberay/kuberay-mcad.md`:

#### Prerequisites

- Kubernetes (see [KinD](https://kind.sigs.k8s.io/docs/user/quick-start/)) or an OpenShift cluster (see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview))
- Kubernetes client tools such as [kubectl](https://kubernetes.io/docs/tasks/tools/) or the [OpenShift CLI](https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html)
- [Helm](https://helm.sh/docs/intro/install/)
- Install the MCAD and KubeRay operators:
  - KinD cluster:

    Install the stable release of the MCAD operator from the local charts.
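
    A minimal sketch of the Helm install, assuming it is run from the repository root; the chart path, namespace, and image values are assumptions, so check [deployment.md](../../../../doc/deploy/deployment.md) for the exact ones:

    ```bash
    # Install the stable release of the MCAD operator from the local chart.
    helm install mcad-controller deployment/mcad-controller \
      --namespace kube-system --wait \
      --set image.repository=quay.io/project-codeflare/mcad-controller \
      --set image.tag=stable
    ```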
    Make sure MCAD has the cluster role to create Ray resources; patch it using [xqueuejob-controller.yaml](doc/usage/examples/kuberay/config/xqueuejob-controller.yaml). For example:
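
    A minimal sketch, assuming the command is run from the repository root:

    ```bash
    # Give MCAD the cluster role it needs to create Ray resources.
    kubectl apply -f doc/usage/examples/kuberay/config/xqueuejob-controller.yaml
    ```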
    See [deployment.md](../../../../doc/deploy/deployment.md) for more options.

    Install the KubeRay operator using the [instructions](https://github.com/ray-project/kuberay#quick-start). For example, install KubeRay v0.6.0 from the remote Helm repo:
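
    A minimal sketch based on the KubeRay quick-start instructions linked above; the release name `kuberay-operator` is an assumption:

    ```bash
    helm repo add kuberay https://ray-project.github.io/kuberay-helm/
    helm repo update
    # Install the KubeRay operator, pinned to v0.6.0.
    helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0
    ```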
  - OpenShift cluster:

    MCAD and KubeRay operators are part of the CodeFlare stack, which provides a simple, user-friendly abstraction for scaling, queuing, and resource management of distributed AI/ML and Python workloads. Please follow the `Distributed Workloads` [Quick-Start](https://github.com/opendatahub-io/distributed-workloads/blob/main/Quick-Start.md) for installation.

#### Steps

- Submit the RayCluster custom resource to MCAD as an AppWrapper using the [aw-raycluster.yaml](doc/usage/examples/kuberay/config/aw-raycluster.yaml) example:
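
  A minimal sketch, assuming the commands are run from the repository root; replace the placeholders in angle brackets with your AppWrapper name and namespace:

  ```bash
  # Submit the RayCluster wrapped as an AppWrapper to MCAD.
  kubectl create -f doc/usage/examples/kuberay/config/aw-raycluster.yaml

  # Check the AppWrapper status and the Ray pods once MCAD dispatches it.
  kubectl describe appwrapper <your-appwrapper-name>
  kubectl get pods -n <your-namespace>
  ```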