# head group template and specs (perhaps 'group' is not needed in the name)
rayVersion: '2.5.0'
# Ray head pod configuration
headGroupSpec:
  # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP.
  # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types.
  serviceType: ClusterIP
  # The `rayStartParams` are used to configure the `ray start` command.
  # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
  # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
  rayStartParams:
    dashboard-host: '0.0.0.0'
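    # Illustrative, commented-out extras (the values are examples, not defaults):
    # num-cpus: '1' # can be auto-completed from the container limits
    # Custom resource annotations for the Ray node take a string-integer mapping,
    # and currently must be provided in exactly this quoted format:
    # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'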
  # pod template
  template:
    metadata:
      # Custom labels. NOTE: To avoid conflicts with the KubeRay operator, do not define custom labels that start with `raycluster`.
      # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
      labels: {}
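      # For example, a hypothetical label that satisfies the rule above:
      # labels:
      #   app.kubernetes.io/name: my-ray-app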
    spec:
      containers:
      - name: ray-head
        image: rayproject/ray:2.5.0
        ports:
        - containerPort: 6379
          name: gcs
        - containerPort: 8265
          name: dashboard
        - containerPort: 10001
          name: client
        lifecycle:
          preStop:
            exec:
              command: ["/bin/sh","-c","ray stop"]
        volumeMounts:
        - mountPath: /tmp/ray
          name: ray-logs
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        # It is better to use a few large Ray pods than many small ones.
        # For production, it is ideal to size each Ray pod to take up the
        # entire Kubernetes node on which it is scheduled.
        resources:
          limits:
            cpu: "1"
            memory: "2G"
          requests:
            # For production use-cases, we recommend specifying integer CPU requests and limits.
            # We also recommend setting requests equal to limits for both CPU and memory.
            # For this example, we use a 500m CPU request to accommodate resource-constrained local
            # Kubernetes testing environments such as KinD and minikube.
            cpu: "500m"
            memory: "2G"
      volumes:
      - name: ray-logs
        emptyDir: {}
workerGroupSpecs:
# The number of pod replicas in this worker group.
- replicas: 1
  minReplicas: 1
  maxReplicas: 10
  # Logical group name; here it is called small-group, but it can also be functional.
  groupName: small-group
  # If worker pods need to be added, we can increment the replicas.
  # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list.
  # The operator will remove pods from the list until the desired number of replicas is satisfied.
  # If the difference between the current replica count and the desired replicas is greater than the
  # number of entries in workersToDelete, random worker pods will be deleted.
  #scaleStrategy:
  #  workersToDelete:
  #  - raycluster-complete-worker-small-group-bdtwh
  #  - raycluster-complete-worker-small-group-hv457
  #  - raycluster-complete-worker-small-group-k8tj7
  # The `rayStartParams` are used to configure the `ray start` command.
  # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
  # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
  rayStartParams: {}
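  # `{}` accepts the KubeRay defaults; worker-specific options could be set here
  # instead, e.g. (illustrative only):
  # rayStartParams:
  #   num-cpus: '1'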
  # pod template
  template:
    spec:
      containers:
      - name: ray-worker
        image: rayproject/ray:2.5.0
        lifecycle:
          preStop:
            exec:
              command: ["/bin/sh","-c","ray stop"]
        # Use volumeMounts. Optional.
        # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
        volumeMounts:
        - mountPath: /tmp/ray
          name: ray-logs
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        # It is better to use a few large Ray pods than many small ones.
        # For production, it is ideal to size each Ray pod to take up the
        # entire Kubernetes node on which it is scheduled.
        resources:
          limits:
            cpu: "1"
            memory: "1G"
          requests:
            # For production use-cases, we recommend specifying integer CPU requests and limits.
            # We also recommend setting requests equal to limits for both CPU and memory.
            # For this example, we use a 500m CPU request to accommodate resource-constrained local
            # Kubernetes testing environments such as KinD and minikube.
            cpu: "500m"
            # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container.
            memory: "1G"
      # Use volumes. Optional.
      # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
      volumes:
      - name: ray-logs
        emptyDir: {}
doc/usage/examples/kuberay/kuberay-mcad.md
#### Prerequisites

- Kubernetes (see [KinD](https://kind.sigs.k8s.io/)) or an OpenShift cluster (see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview))
- Kubernetes client tools such as [kubectl](https://kubernetes.io/docs/tasks/tools/) or the [OpenShift CLI](https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html)
- [Helm](https://helm.sh/docs/intro/install/)
- Install a stable MCAD release on your Kubernetes cluster using Helm. See [deployment.md](doc/deploy/deployment.md) for more options. For example:
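
  A minimal sketch, assuming the MCAD chart ships in the repository's `deployment` directory (the release name and namespace are illustrative):

  ```sh
  # Illustrative only: install the MCAD controller chart from a local checkout.
  helm install mcad-controller deployment/mcad-controller --namespace kube-system --wait
  ```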
- Make sure MCAD has the clusterrole to create Ray resources; patch it using [xqueuejob-controller.yaml](doc/usage/examples/kuberay/config/xqueuejob-controller.yaml). For example:
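
  Assuming the command is run from the repository root:

  ```sh
  kubectl apply -f doc/usage/examples/kuberay/config/xqueuejob-controller.yaml
  ```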
- Install the kuberay operator using the [instructions](https://github.com/ray-project/kuberay#quick-start). For example, install kuberay v0.6.0 from the remote Helm repo:
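
  One possible sequence, following the KubeRay quick start (the release name `kuberay-operator` is an assumption):

  ```sh
  helm repo add kuberay https://ray-project.github.io/kuberay-helm/
  helm repo update
  # Install the KubeRay operator chart, pinned to v0.6.0.
  helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0
  ```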
- Submit the RayCluster custom resource to MCAD as an AppWrapper using the [aw-raycluster.yaml](doc/usage/examples/kuberay/config/aw-raycluster.yaml) example:
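
  For example, from the repository root:

  ```sh
  # Create the AppWrapper; MCAD queues it and creates the RayCluster when resources permit.
  kubectl create -f doc/usage/examples/kuberay/config/aw-raycluster.yaml

  # Check the AppWrapper status and the resulting pods.
  kubectl describe appwrapper <your-appwrapper-name>
  kubectl get pods -n <your-name-space>
  ```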