
Commit c7cf12a

RHOAIENG-33283: Error handling, remove obsolete volumes, housekeeping
1 parent 57992ee · commit c7cf12a

File tree

7 files changed: +57, −171 lines

src/codeflare_sdk/common/widgets/test_widgets.py

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ def test_view_clusters(mocker, capsys):
     # Prepare to run view_clusters when notebook environment is detected
     mocker.patch("codeflare_sdk.common.widgets.widgets.is_notebook", return_value=True)
     mock_get_current_namespace = mocker.patch(
-        "codeflare_sdk.common.utils.get_current_namespace",
+        "codeflare_sdk.common.widgets.widgets.get_current_namespace",
         return_value="default",
     )
     namespace = mock_get_current_namespace.return_value
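
Note: the patch target moves to where the name is looked up. With pytest-mock (a thin wrapper around unittest.mock), you patch the name in the namespace of the module under test, not in the module that defines it. A minimal sketch of the pattern, assuming widgets.py imports get_current_namespace at module level (paths are illustrative):

    # Sketch: patch the name where it is used, not where it is defined.
    # If widgets.py does `from ..utils import get_current_namespace`, the name
    # lives in the widgets module's namespace and must be patched there.
    def test_namespace_is_mocked(mocker):
        mocked = mocker.patch(
            "codeflare_sdk.common.widgets.widgets.get_current_namespace",
            return_value="default",
        )
        assert mocked.return_value == "default"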

src/codeflare_sdk/common/widgets/widgets.py

Lines changed: 0 additions & 2 deletions
@@ -353,8 +353,6 @@ def view_clusters(namespace: str = None):
         )
         return  # Exit function if not in Jupyter Notebook

-    from ...common.utils import get_current_namespace
-
     if not namespace:
         namespace = get_current_namespace()

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 1 addition & 1 deletion
@@ -520,7 +520,7 @@ def cluster_dashboard_uri(self) -> str:
                     protocol = "https" if route["spec"].get("tls") else "http"
                     return f"{protocol}://{route['spec']['host']}"
             # No route found for this cluster
-            return "Dashboard not available yet, have you run cluster.up()?"
+            return "Dashboard not available yet, have you run cluster.apply()?"
         else:
             try:
                 api_instance = client.NetworkingV1Api(get_api_client())
src/codeflare_sdk/ray/rayjobs/config.py

Lines changed: 2 additions & 48 deletions
@@ -22,8 +22,6 @@
 from dataclasses import dataclass, field, fields
 from typing import Dict, List, Optional, Union, get_args, get_origin, Any, Tuple
 from kubernetes.client import (
-    V1ConfigMapVolumeSource,
-    V1KeyToPath,
     V1LocalObjectReference,
     V1SecretVolumeSource,
     V1Toleration,
@@ -62,50 +60,6 @@
     "huawei.com/Ascend310": "NPU",
 }

-# Default volume mounts for CA certificates
-DEFAULT_VOLUME_MOUNTS = [
-    V1VolumeMount(
-        mount_path="/etc/pki/tls/certs/odh-trusted-ca-bundle.crt",
-        name="odh-trusted-ca-cert",
-        sub_path="odh-trusted-ca-bundle.crt",
-    ),
-    V1VolumeMount(
-        mount_path="/etc/ssl/certs/odh-trusted-ca-bundle.crt",
-        name="odh-trusted-ca-cert",
-        sub_path="odh-trusted-ca-bundle.crt",
-    ),
-    V1VolumeMount(
-        mount_path="/etc/pki/tls/certs/odh-ca-bundle.crt",
-        name="odh-ca-cert",
-        sub_path="odh-ca-bundle.crt",
-    ),
-    V1VolumeMount(
-        mount_path="/etc/ssl/certs/odh-ca-bundle.crt",
-        name="odh-ca-cert",
-        sub_path="odh-ca-bundle.crt",
-    ),
-]
-
-# Default volumes for CA certificates
-DEFAULT_VOLUMES = [
-    V1Volume(
-        name="odh-trusted-ca-cert",
-        config_map=V1ConfigMapVolumeSource(
-            name="odh-trusted-ca-bundle",
-            items=[V1KeyToPath(key="ca-bundle.crt", path="odh-trusted-ca-bundle.crt")],
-            optional=True,
-        ),
-    ),
-    V1Volume(
-        name="odh-ca-cert",
-        config_map=V1ConfigMapVolumeSource(
-            name="odh-trusted-ca-bundle",
-            items=[V1KeyToPath(key="odh-ca-bundle.crt", path="odh-ca-bundle.crt")],
-            optional=True,
-        ),
-    ),
-]
-

 @dataclass
 class ManagedClusterConfig:
@@ -426,7 +380,7 @@ def _build_pod_spec(self, container: V1Container, is_head: bool) -> V1PodSpec:

     def _generate_volume_mounts(self) -> list:
         """Generate volume mounts for the container."""
-        volume_mounts = DEFAULT_VOLUME_MOUNTS.copy()
+        volume_mounts = []

         # Add custom volume mounts if specified
         if hasattr(self, "volume_mounts") and self.volume_mounts:
@@ -436,7 +390,7 @@ def _generate_volume_mounts(self) -> list:

     def _generate_volumes(self) -> list:
         """Generate volumes for the pod."""
-        volumes = DEFAULT_VOLUMES.copy()
+        volumes = []

         # Add custom volumes if specified
         if hasattr(self, "volumes") and self.volumes:
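
Note: with the ODH CA-bundle defaults removed, pods get no implicit certificate mounts; anything extra must be supplied explicitly. A minimal sketch, assuming ManagedClusterConfig accepts volumes and volume_mounts fields as the hasattr checks above suggest (names and paths are illustrative):

    # Sketch: passing custom volumes now that no defaults are injected.
    from kubernetes.client import V1ConfigMapVolumeSource, V1Volume, V1VolumeMount
    from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig

    config = ManagedClusterConfig(
        volumes=[
            V1Volume(
                name="my-ca-cert",
                config_map=V1ConfigMapVolumeSource(name="my-ca-bundle", optional=True),
            )
        ],
        volume_mounts=[
            V1VolumeMount(name="my-ca-cert", mount_path="/etc/ssl/certs/my-ca-bundle.crt")
        ],
    )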

src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 0 additions & 5 deletions
@@ -169,11 +169,6 @@ def submit(self) -> str:
         # Extract files from entrypoint and runtime_env working_dir
         files = extract_all_local_files(self)

-        # Create Secret for files (will be mounted to submitter pod)
-        secret_name = None
-        if files:
-            secret_name = f"{self.name}-files"
-
         rayjob_cr = self._build_rayjob_cr()

         logger.info(f"Submitting RayJob {self.name} to Kuberay operator")

src/codeflare_sdk/ray/rayjobs/runtime_env.py

Lines changed: 16 additions & 24 deletions
@@ -113,7 +113,7 @@ def extract_all_local_files(job: RayJob) -> Optional[Dict[str, str]]:

 def _zip_directory(directory_path: str) -> Optional[bytes]:
     """
-    Zip entire directory preserving structure, excluding Jupyter notebook files.
+    Zip entire directory preserving structure, excluding Jupyter notebook and markdown files.

     Args:
         directory_path: Path to directory to zip
@@ -338,29 +338,22 @@ def create_secret_from_spec(

     metadata = client.V1ObjectMeta(**secret_spec["metadata"])

-    # Add owner reference if we have the RayJob result
-    if (
-        rayjob_result
-        and isinstance(rayjob_result, dict)
-        and rayjob_result.get("metadata", {}).get("uid")
-    ):
-        logger.info(
-            f"Adding owner reference to Secret '{secret_name}' with RayJob UID: {rayjob_result['metadata']['uid']}"
-        )
-        metadata.owner_references = [
-            client.V1OwnerReference(
-                api_version="ray.io/v1",
-                kind="RayJob",
-                name=job.name,
-                uid=rayjob_result["metadata"]["uid"],
-                controller=True,
-                block_owner_deletion=True,
-            )
-        ]
-    else:
-        logger.warning(
-            f"No valid RayJob result with UID found, Secret '{secret_name}' will not have owner reference. Result: {rayjob_result}"
+    # Add owner reference to ensure proper cleanup
+    # We can trust that rayjob_result contains UID since submit_job() only returns
+    # complete K8s resources or None, and we already validated result exists
+    logger.info(
+        f"Adding owner reference to Secret '{secret_name}' with RayJob UID: {rayjob_result['metadata']['uid']}"
+    )
+    metadata.owner_references = [
+        client.V1OwnerReference(
+            api_version="ray.io/v1",
+            kind="RayJob",
+            name=job.name,
+            uid=rayjob_result["metadata"]["uid"],
+            controller=True,
+            block_owner_deletion=True,
         )
+    ]

     # Convert dict spec to V1Secret
     # Use stringData instead of data to avoid double base64 encoding
@@ -409,5 +402,4 @@ def create_file_secret(
     )

     # Create Secret with owner reference
-    # TODO Error handling
     create_secret_from_spec(job, secret_spec, rayjob_result)
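
Note: because the Secret now always carries an owner reference, its lifetime is tied to the RayJob: deleting the RayJob CR lets Kubernetes garbage-collect the Secret. A self-contained sketch of that mechanism, separate from the SDK code (names and data are placeholders):

    # Sketch: a Secret owned by a RayJob via ownerReferences, so deleting the
    # RayJob lets the Kubernetes garbage collector remove the Secret as well.
    from kubernetes import client

    def owned_secret(name, namespace, rayjob_name, rayjob_uid):
        owner = client.V1OwnerReference(
            api_version="ray.io/v1",
            kind="RayJob",
            name=rayjob_name,
            uid=rayjob_uid,
            controller=True,
            block_owner_deletion=True,
        )
        metadata = client.V1ObjectMeta(
            name=name, namespace=namespace, owner_references=[owner]
        )
        return client.V1Secret(metadata=metadata, string_data={"main.py": "print('ok')"})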

src/codeflare_sdk/ray/rayjobs/test/test_runtime_env.py

Lines changed: 37 additions & 90 deletions
@@ -176,7 +176,16 @@ def test_create_secret_from_spec(auto_mock_setup):
         "data": {"test.py": "print('test')"},
     }

-    result = create_secret_from_spec(rayjob, secret_spec)
+    # Provide valid RayJob result with UID as KubeRay client would
+    rayjob_result = {
+        "metadata": {
+            "name": "test-job",
+            "namespace": "test-namespace",
+            "uid": "test-uid-12345",
+        }
+    }
+
+    result = create_secret_from_spec(rayjob, secret_spec, rayjob_result)

     assert result == "test-files"
     mock_api_instance.create_namespaced_secret.assert_called_once()
@@ -205,7 +214,16 @@ def test_create_secret_already_exists(auto_mock_setup):
         "data": {"test.py": "print('test')"},
     }

-    result = create_secret_from_spec(rayjob, secret_spec)
+    # Provide valid RayJob result with UID as KubeRay client would
+    rayjob_result = {
+        "metadata": {
+            "name": "test-job",
+            "namespace": "test-namespace",
+            "uid": "test-uid-67890",
+        }
+    }
+
+    result = create_secret_from_spec(rayjob, secret_spec, rayjob_result)

     assert result == "test-files"
     mock_api_instance.create_namespaced_secret.assert_called_once()
@@ -271,92 +289,6 @@ def test_create_secret_with_owner_reference_basic(mocker, auto_mock_setup, caplo
     mock_api_instance.create_namespaced_secret.assert_called_once()


-def test_create_secret_without_owner_reference_no_uid(mocker, auto_mock_setup, caplog):
-    """
-    Test creating Secret without owner reference when RayJob has no UID.
-    """
-    mock_api_instance = auto_mock_setup["k8s_api"]
-
-    mock_v1_metadata = mocker.patch("kubernetes.client.V1ObjectMeta")
-    mock_metadata_instance = MagicMock()
-    mock_v1_metadata.return_value = mock_metadata_instance
-
-    rayjob = RayJob(
-        job_name="test-job",
-        cluster_name="existing-cluster",
-        entrypoint="python test.py",
-        namespace="test-namespace",
-    )
-
-    secret_spec = {
-        "apiVersion": "v1",
-        "kind": "Secret",
-        "type": "Opaque",
-        "metadata": {"name": "test-files", "namespace": "test-namespace"},
-        "data": {"test.py": "print('test')"},
-    }
-
-    # RayJob result without UID
-    rayjob_result = {
-        "metadata": {
-            "name": "test-job",
-            "namespace": "test-namespace",
-            # No UID field
-        }
-    }
-
-    with caplog.at_level("WARNING"):
-        result = create_secret_from_spec(rayjob, secret_spec, rayjob_result)
-
-    assert result == "test-files"
-
-    # Verify warning was logged and no owner reference was set
-    assert (
-        "No valid RayJob result with UID found, Secret 'test-files' will not have owner reference"
-        in caplog.text
-    )
-
-    # The important part is that the warning was logged, indicating no owner reference was set
-    mock_api_instance.create_namespaced_secret.assert_called_once()
-
-
-def test_create_secret_with_invalid_rayjob_result(auto_mock_setup, caplog):
-    """
-    Test creating Secret with None or invalid rayjob_result.
-    """
-    mock_api_instance = auto_mock_setup["k8s_api"]
-
-    rayjob = RayJob(
-        job_name="test-job",
-        cluster_name="existing-cluster",
-        entrypoint="python test.py",
-        namespace="test-namespace",
-    )
-
-    secret_spec = {
-        "apiVersion": "v1",
-        "kind": "Secret",
-        "type": "Opaque",
-        "metadata": {"name": "test-files", "namespace": "test-namespace"},
-        "data": {"test.py": "print('test')"},
-    }
-
-    # Test with None
-    with caplog.at_level("WARNING"):
-        result = create_secret_from_spec(rayjob, secret_spec, None)
-
-    assert result == "test-files"
-    assert "No valid RayJob result with UID found" in caplog.text
-
-    # Test with string instead of dict
-    caplog.clear()
-    with caplog.at_level("WARNING"):
-        result = create_secret_from_spec(rayjob, secret_spec, "not-a-dict")
-
-    assert result == "test-files"
-    assert "No valid RayJob result with UID found" in caplog.text
-
-
 def test_file_handling_kubernetes_best_practice_flow(mocker, tmp_path):
     """
     Test the Kubernetes best practice flow: pre-declare volume, submit, create Secret.
@@ -446,7 +378,13 @@ def test_rayjob_submit_with_files_new_cluster(auto_mock_setup, tmp_path):
     Test RayJob submission with file detection for new cluster.
     """
     mock_api_instance = auto_mock_setup["rayjob_api"]
-    mock_api_instance.submit_job.return_value = True
+    mock_api_instance.submit_job.return_value = {
+        "metadata": {
+            "name": "test-job",
+            "namespace": "test-namespace",
+            "uid": "test-uid-files-12345",
+        }
+    }

     mock_k8s_instance = auto_mock_setup["k8s_api"]

@@ -507,8 +445,17 @@ def test_create_secret_api_error_non_409(auto_mock_setup):
         "data": {"test.py": "print('test')"},
     }

+    # Provide valid RayJob result with UID as KubeRay client would
+    rayjob_result = {
+        "metadata": {
+            "name": "test-job",
+            "namespace": "test-namespace",
+            "uid": "test-uid-api-error",
+        }
+    }
+
     with pytest.raises(RuntimeError, match="Failed to create Secret"):
-        create_secret_from_spec(rayjob, secret_spec)
+        create_secret_from_spec(rayjob, secret_spec, rayjob_result)


 def test_add_file_volumes_existing_volume_skip():
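
Note: since submit_job() is now expected to return the created RayJob resource rather than a boolean, other tests that mock it need the same dict shape; a small hedged fixture sketch (the fixture name and values are hypothetical):

    # Hypothetical pytest fixture yielding the resource-shaped value the updated
    # code path expects back from RayjobApi.submit_job().
    import pytest

    @pytest.fixture
    def fake_rayjob_result():
        return {
            "metadata": {
                "name": "test-job",
                "namespace": "test-namespace",
                "uid": "test-uid-12345",
            }
        }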
