16 changes: 11 additions & 5 deletions src/codeflare_sdk/cluster/cluster.py
@@ -12,14 +12,20 @@ def __init__(self, config: ClusterConfiguration):
         self.app_wrapper_yaml = self.create_app_wrapper()
 
     def create_app_wrapper(self):
-        cpu=self.config.max_cpus
-        memory=self.config.memory
+        min_cpu=self.config.min_cpus
+        max_cpu=self.config.max_cpus
+        min_memory=self.config.min_memory
+        max_memory=self.config.max_memory
         gpu=self.config.gpu
         workers=self.config.max_worker
         template=self.config.template
-        return generate_appwrapper(cpu=cpu, memory=memory,
-                                   gpu=gpu, workers=workers,
-                                   template=template)
+        image=self.config.image
+        instascale=self.config.instascale
+        instance_types=self.config.machine_types
+        env=self.config.envs
+        return generate_appwrapper(min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory,
+                                   max_memory=max_memory, gpu=gpu, workers=workers, template=template,
+                                   image=image, instascale=instascale, instance_types=instance_types, env=env)
 
     # creates a new cluster with the provided or default spec
     def up(self, namespace='default'):
4 changes: 2 additions & 2 deletions src/codeflare_sdk/cluster/config.py
@@ -4,15 +4,15 @@
 class ClusterConfiguration:
     name: str
     head_info: list = []
-    machine_types: list = []
+    machine_types: list = [] #["m4.xlarge", "g4dn.xlarge"]
     min_cpus: int = 1
     max_cpus: int = 1
     min_worker: int = 1
     max_worker: int = 1
     min_memory: int = 2
     max_memory: int = 2
     gpu: int = 0
-    template: str = "src/codeflare_sdk/templates/base-template.yaml"
+    template: str = "src/codeflare_sdk/templates/new-template.yaml"
     instascale: bool = False
     envs: dict = {}
     image: str = "rayproject/ray:latest"
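
For orientation, a minimal usage sketch of the expanded configuration. Field names and defaults are taken from the diff above; the Cluster class name and the import paths are assumptions inferred from the file paths, and the chosen values mirror the new template (3 workers, 2 CPUs, 12G memory, 1 GPU per worker, m4.xlarge/g4dn.xlarge instance types).

from codeflare_sdk.cluster.cluster import Cluster              # assumed import path
from codeflare_sdk.cluster.config import ClusterConfiguration  # assumed import path

config = ClusterConfiguration(
    name="raytest",
    machine_types=["m4.xlarge", "g4dn.xlarge"],
    min_cpus=2,
    max_cpus=2,
    min_memory=12,
    max_memory=12,
    gpu=1,
    min_worker=3,
    max_worker=3,
    instascale=True,
    envs={},
    image="rayproject/ray:latest",
)

cluster = Cluster(config)        # builds the AppWrapper YAML via create_app_wrapper()
cluster.up(namespace="default")  # submits it, as defined by up() above
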
234 changes: 234 additions & 0 deletions src/codeflare_sdk/templates/new-template.yml
@@ -0,0 +1,234 @@
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
name: aw-kuberay
namespace: default
#new addition
labels:
orderedinstance: "m4.xlarge_g4dn.xlarge"
spec:
priority: 9
resources:
Items: []
GenericItems:
- replicas: 1
#new addition
custompodresources:
- replicas: 1
requests:
cpu: 2
memory: 12G
nvidia.com/gpu: 0
limits:
cpu: 2
memory: 12G
nvidia.com/gpu: 0
- replicas: 3
requests:
cpu: 2
memory: 12G
nvidia.com/gpu: 1
limits:
cpu: 2
memory: 12G
nvidia.com/gpu: 1
generictemplate:
# This config demonstrates KubeRay's Ray autoscaler integration.
# The resource requests and limits in this config are too small for production!
# For an example with more realistic resource configuration, see
# ray-cluster.autoscaler.large.yaml.
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
labels:
appwrapper.mcad.ibm.com: "aw-kuberay"
controller-tools.k8s.io: "1.0"
# A unique identifier for the head node and workers of this cluster.
name: kuberay-cluster
# finalizers:
# - kubernetes
spec:
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '1.12.0'
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
enableInTreeAutoscaling: false
# autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
# The example configuration shown below represents the DEFAULT values.
# (You may delete autoscalerOptions if the defaults are suitable.)
autoscalerOptions:
# upscalingMode is "Default" or "Aggressive."
# Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
# Default: Upscaling is not rate-limited.
# Aggressive: An alias for Default; upscaling is not rate-limited.
upscalingMode: Default
# idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
idleTimeoutSeconds: 60
# image optionally overrides the autoscaler's container image.
# If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
# the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
## image: "my-repo/my-custom-autoscaler-image:tag"
# imagePullPolicy optionally overrides the autoscaler container's image pull policy.
imagePullPolicy: Always
# resources specifies optional resource request and limit overrides for the autoscaler container.
# For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
resources:
limits:
cpu: "500m"
memory: "512Mi"
requests:
cpu: "500m"
memory: "512Mi"
######################headGroupSpec#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
# logical group name, for this called head-group, also can be functional
# pod type head or worker
# rayNodeType: head # Not needed since it is under the headgroup
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
# Flag "no-monitor" will be automatically set when autoscaling is enabled.
dashboard-host: '0.0.0.0'
block: 'true'
# num-cpus: '1' # can be auto-completed from the limits
# Use `resources` to optionally specify custom resource annotations for the Ray node.
# The value of `resources` is a string-integer mapping.
# Currently, `resources` must be provided in the specific format demonstrated below:
# resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
num-gpus: 0
#pod template
template:
spec:
#new addition
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: aw-kuberay
operator: In
values:
- "aw-kuberay"
containers:
# The Ray head pod
- name: ray-head
image: rayproject/ray:latest
env:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: s3-creds
key: AWS_ACCESS_KEY_ID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: s3-creds
key: AWS_SECRET_ACCESS_KEY
- name: ENDPOINT_URL
valueFrom:
secretKeyRef:
name: s3-creds
key: ENDPOINT_URL
imagePullPolicy: Always
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "0"
requests:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "0"
workerGroupSpecs:
# the pod replicas in this group typed worker
- replicas: 3
minReplicas: 3
maxReplicas: 3
# logical group name, for this called small-group, also can be functional
groupName: small-group
# if worker pods need to be added, we can simply increment the replicas
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
# the operator will remove pods from the list until the number of replicas is satisfied
# when a pod is confirmed to be deleted, its name will be removed from the list below
#scaleStrategy:
# workersToDelete:
# - raycluster-complete-worker-small-group-bdtwh
# - raycluster-complete-worker-small-group-hv457
# - raycluster-complete-worker-small-group-k8tj7
# the following params are used to complete the ray start: ray start --block ...
rayStartParams:
block: 'true'
num-gpus: 1
#pod template
template:
metadata:
labels:
key: value
# annotations for pod
annotations:
key: value
# finalizers:
# - kubernetes
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: aw-kuberay
operator: In
values:
- "aw-kuberay"
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
- name: init-myservice
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:latest
env:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: s3-creds
key: AWS_ACCESS_KEY_ID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: s3-creds
key: AWS_SECRET_ACCESS_KEY
- name: ENDPOINT_URL
valueFrom:
secretKeyRef:
name: s3-creds
key: ENDPOINT_URL
# environment variables to set in the container. Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
requests:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
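
To show how the create_app_wrapper keyword arguments appear to line up with the fields of the template above, here is a hedged sketch. The actual substitution logic lives in generate_appwrapper, which is not part of this diff, so the module path and the field mapping noted in the comments are assumptions.

from codeflare_sdk.utils.generate_yaml import generate_appwrapper  # assumed module path

aw_yaml = generate_appwrapper(
    min_cpu=2,        # presumably fills requests.cpu for the Ray head/worker pods
    max_cpu=2,        # presumably fills limits.cpu
    min_memory=12,    # presumably fills requests.memory (rendered as "12G")
    max_memory=12,    # presumably fills limits.memory
    gpu=1,            # presumably fills nvidia.com/gpu and the worker num-gpus start param
    workers=3,        # presumably fills replicas/minReplicas/maxReplicas of the worker group
    template="src/codeflare_sdk/templates/new-template.yaml",
    image="rayproject/ray:latest",                 # Ray head and worker container image
    instascale=True,                               # presumably drives the orderedinstance label
    instance_types=["m4.xlarge", "g4dn.xlarge"],   # e.g. "m4.xlarge_g4dn.xlarge"
    env={},                                        # extra container environment variables
)
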