From c4905a479957d7c4f06ccf16d86c1640866fce15 Mon Sep 17 00:00:00 2001 From: Grigory Pomadchin Date: Fri, 7 Jan 2022 18:51:45 -0500 Subject: [PATCH 1/3] Add PySpark Cleanup Spark cpu pools Expose SparkUI in the spawned notebooks, add more details into the PySpark profile description Add the Spark env init script Add Spark executors template to support nodes tolerations Allocate NoteBooks per namespace Add hub and proxy ClusterRoles Make AKS RBAC enabled Rename ensure_service_account function Change the spark executor image Expose the Spark UI in the pyspark profile case Cleanup hub and autohttps ClusterRoles --- .gitignore | 1 + README.md | 2 +- helm/chart/templates/autohttps-rbac.yaml | 33 ++++++ helm/chart/templates/hub-rbac.yaml | 39 +++++++ helm/kbatch-proxy-values.yaml | 11 +- helm/profiles.yaml | 89 ++++++++++++++++ helm/values.yaml | 124 ++++++++++++++++++++++- terraform/prod/main.tf | 1 + terraform/resources/aks.tf | 44 ++++++++ terraform/resources/hub.tf | 2 +- terraform/resources/providers.tf | 2 + terraform/resources/variables.tf | 11 ++ terraform/staging/main.tf | 1 + 13 files changed, 355 insertions(+), 5 deletions(-) create mode 100644 helm/chart/templates/autohttps-rbac.yaml create mode 100644 helm/chart/templates/hub-rbac.yaml diff --git a/.gitignore b/.gitignore index 89ef6b1..461da1c 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ package.json jupyterhub_cookie_secret jupyterhub-proxy.pid .kubeconfig +.history diff --git a/README.md b/README.md index 572bdda..aaca418 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ The bulk of the configuration is done in `values.yaml`. See the inline comments `jupyterhub_opencensus_monitor.yaml` sets `daskhub.jupyterhub.hub.extraFiles.jupyterhub_open_census_monitor.stringData` to be the `jupyterhub_opencensus_monitor.py` script (see below). We couldn't figure out out to get the helm-release provider working with with kubectl's `set-file` so we needed to inline the script. There's probably a better way to do this. -Finally, the custom UI elements used by the Hub process and additional notebook server configuration are included under `helm/chart/files` and `helm/cart/templates`. These are mounted into the pods. See [custom UI](#custom-ui) for more. +Finally, the custom UI elements used by the Hub process and additional notebook server configuration are included under `helm/chart/files` and `helm/chart/templates`. These are mounted into the pods. See [custom UI](#custom-ui) for more. 
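A note on the RBAC manifests added below: the new `hub` ClusterRole is what lets the Hub create per-user namespaces and copy resources into them. After a deploy, the binding can be sanity-checked with an access review; the snippet is only a sketch (not part of the patch), and `prod` stands in for whatever namespace the chart is released into.

```python
# Sketch: confirm the hub ServiceAccount received cluster-wide namespace rights.
# Assumes a kubeconfig with sufficient privileges; "prod" is a placeholder for
# the release namespace used in the ClusterRoleBinding below.
from kubernetes import client, config

config.load_kube_config()
review = client.V1SubjectAccessReview(
    spec=client.V1SubjectAccessReviewSpec(
        user="system:serviceaccount:prod:hub",
        resource_attributes=client.V1ResourceAttributes(verb="create", resource="namespaces"),
    )
)
result = client.AuthorizationV1Api().create_subject_access_review(review)
print(result.status.allowed)  # expected to be True once the ClusterRoleBinding is applied
```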
## Terraform diff --git a/helm/chart/templates/autohttps-rbac.yaml b/helm/chart/templates/autohttps-rbac.yaml new file mode 100644 index 0000000..a343db5 --- /dev/null +++ b/helm/chart/templates/autohttps-rbac.yaml @@ -0,0 +1,33 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: autohttps + namespace: {{ .Release.Namespace }} + labels: + chart: {{ .Chart.Name }}-{{ .Chart.Version }} + component: autohttps + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "patch", "list", "create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: autohttps + namespace: {{ .Release.Namespace }} + labels: + chart: {{ .Chart.Name }}-{{ .Chart.Version }} + component: autohttps + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: autohttps + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: autohttps diff --git a/helm/chart/templates/hub-rbac.yaml b/helm/chart/templates/hub-rbac.yaml new file mode 100644 index 0000000..9b0dc12 --- /dev/null +++ b/helm/chart/templates/hub-rbac.yaml @@ -0,0 +1,39 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: hub + namespace: {{ .Release.Namespace }} + labels: + chart: {{ .Chart.Name }}-{{ .Chart.Version }} + component: hub + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} +rules: + - apiGroups: [""] + resources: ["pods", "persistentvolumeclaims", "secrets", "configmaps", "services", "namespaces", "serviceaccounts"] + verbs: ["get", "watch", "list", "create", "delete", "update"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "watch", "list", "create", "delete", "update"] + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "watch", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: hub + namespace: {{ .Release.Namespace }} + labels: + chart: {{ .Chart.Name }}-{{ .Chart.Version }} + component: hub + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: hub + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: hub diff --git a/helm/kbatch-proxy-values.yaml b/helm/kbatch-proxy-values.yaml index 6d0957e..15026ee 100644 --- a/helm/kbatch-proxy-values.yaml +++ b/helm/kbatch-proxy-values.yaml @@ -47,7 +47,16 @@ kbatch-proxy: mountPath: /profile-template.yaml data: python: - image: mcr.microsoft.com/planetary-computer/python:2021.11.30.0 + image: cr.microsoft.com/planetary-computer/python:2021.11.30.0 + resources: + requests: + cpu: "3.6" + memory: "27G" + limits: + cpu: "4" + memory: "32G" + pyspark: + image: daunnc/planetary-computer-python-jdk:2021.11.30.0 resources: requests: cpu: "3.6" diff --git a/helm/profiles.yaml b/helm/profiles.yaml index 750ea69..4697e2b 100644 --- a/helm/profiles.yaml +++ b/helm/profiles.yaml @@ -20,6 +20,24 @@ daskhub: values: - gpu + # Spark ----------------------------------------------------- + - display_name: "PySpark" + default: "True" + description: '4 cores, 32 GiB of memory. Pangeo Notebook environment powered by Raster Frames, GeoTrellis and Apache Spark.' 
+          kubespawner_override:
+            image: "${pyspark_image}"
+            cpu_guarantee: 3
+            cpu_limit: 4
+            mem_guarantee: "25G"
+            mem_limit: "32G"
+            default_url: "/lab/tree/PlanetaryComputerExamples/README.md"
+            node_affinity_required:
+              - matchExpressions:
+                  - key: pc.microsoft.com/userkind
+                    operator: NotIn
+                    values:
+                      - gpu
+
           # R --------------------------------------------------------------------
           - display_name: "R"
             description: '8 cores, 64 GiB of memory. R geospatial environment.'
@@ -108,3 +126,74 @@ daskhub:
                     operator: NotIn
                     values:
                       - gpu
+
+      extraFiles:
+        spark_default_configuration:
+          # TODO(https://github.com/hashicorp/terraform-provider-helm/issues/628): use set-file
+          stringData: |
+            """
+            Default Spark configuration init for the Jupyter instance.
+            """
+            import socket
+            import os
+            notebook_ip = socket.gethostbyname(socket.gethostname())
+            namespace_user = os.environ.get('NAMESPACE_USER', '')
+            spark_config = {
+                'spark.master': 'k8s://https://kubernetes.default.svc.cluster.local',
+                'spark.app.name': 'STAC API with RF in K8S',
+                'spark.ui.port': '4040',
+                'spark.driver.blockManager.port': '7777',
+                'spark.driver.port': '2222',
+                'spark.driver.host': notebook_ip,
+                'spark.driver.bindAddress': '0.0.0.0',
+                'spark.executor.instances': '2',
+                'spark.executor.memory': '4g',
+                'spark.driver.memory': '1g',
+                'spark.executor.cores': '3',
+                'spark.kubernetes.namespace': namespace_user,
+                'spark.kubernetes.container.image': 'quay.io/daunnc/spark-k8s-py-3.8.8-gdal32-msftpc:3.1.2',
+                'spark.kubernetes.executor.deleteOnTermination': 'true',
+                'spark.kubernetes.authenticate.driver.serviceAccountName': 'default',
+                'spark.kubernetes.authenticate.caCertFile': '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt',
+                'spark.kubernetes.authenticate.oauthTokenFile': '/var/run/secrets/kubernetes.io/serviceaccount/token',
+                'spark.kubernetes.executor.podTemplateFile': '/home/jovyan/executor-template.yml',
+                'spark.kubernetes.node.selector.k8s.spark.org/dedicated': 'worker',
+                'spark.kubernetes.node.selector.pc.microsoft.com/workerkind': 'spark-cpu',
+                'spark.kubernetes.node.selector.kubernetes.azure.com/scalesetpriority': 'spot'
+            }
+
+        # By default Spark can only steer executors onto a pool through node selectors and has no way to tolerate taints.
+        # This pod template adds the toleration executors need to land on the Azure Spot nodes.
+        spark_executor_template:
+          stringData: |
+            #
+            # Licensed to the Apache Software Foundation (ASF) under one or more
+            # contributor license agreements. See the NOTICE file distributed with
+            # this work for additional information regarding copyright ownership.
+            # The ASF licenses this file to You under the Apache License, Version 2.0
+            # (the "License"); you may not use this file except in compliance with
+            # the License. You may obtain a copy of the License at
+            #
+            # http://www.apache.org/licenses/LICENSE-2.0
+            #
+            # Unless required by applicable law or agreed to in writing, software
+            # distributed under the License is distributed on an "AS IS" BASIS,
+            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+            # See the License for the specific language governing permissions and
+            # limitations under the License.
+            #
+            apiVersion: v1
+            kind: Pod
+            metadata:
+              labels:
+                template-label-key: executor-template-label-value
+            spec:
+              containers:
+                - name: test-executor-container
+                  image: will-be-overwritten
+              # extra toleration to support Spot instances
+              tolerations:
+                - key: "kubernetes.azure.com/scalesetpriority"
+                  operator: "Equal"
+                  value: "spot"
+                  effect: "NoSchedule"
diff --git a/helm/values.yaml b/helm/values.yaml
index fbc7e12..e18f06e 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -85,10 +85,17 @@ daskhub:
           c.KubeSpawner.extra_labels = {}
         kubespawner: |
           c.KubeSpawner.start_timeout = 15 * 60 # 15 minutes
+          # pass the parent (hub) namespace through; pre_spawn_hook needs it to copy resources
+          c.KubeSpawner.environment['NAMESPACE_PARENT'] = c.KubeSpawner.namespace
+          # the hub allocates notebook pods in per-user namespaces
+          c.KubeSpawner.enable_user_namespaces = True
+          # the hub url must be reachable across namespaces, so use the fully qualified service name
+          c.KubeSpawner.hub_connect_url = "http://hub.${namespace}.svc.cluster.local:8081"
+
         01-add-dask-gateway-values: |
           # The daskhub helm chart doesn't correctly handle hub.baseUrl.
           # DASK_GATEWAY__PUBLIC_ADDRESS set via terraform
-          c.KubeSpawner.environment["DASK_GATEWAY__ADDRESS"] = "http://proxy-http:8000/compute/services/dask-gateway/"
+          c.KubeSpawner.environment["DASK_GATEWAY__ADDRESS"] = "http://proxy-http.${namespace}.svc.cluster.local:8000/compute/services/dask-gateway/"
           c.KubeSpawner.environment["DASK_GATEWAY__PUBLIC_ADDRESS"] = "https://${jupyterhub_host}/compute/services/dask-gateway/"
         templates: |
           c.JupyterHub.template_paths.insert(0, "/etc/jupyterhub/templates")
@@ -97,8 +104,97 @@ daskhub:
           # Sets the following
           # 1. environment variable PC_SDK_SUBSCRIPTION_KEY
           # ---------------------------------------------------
+          from kubernetes.client import RbacAuthorizationV1Api
+          from kubernetes.client.rest import ApiException
+          from kubernetes.client.models import V1Role, V1PolicyRule, V1ObjectMeta, V1Subject, V1RoleRef, V1RoleBinding, V1ServiceAccount
+
+          async def ensure_service_account_role(spawner, name, namespace, role_name):
+              api = spawner.api
+              try:
+                  api.create_namespaced_service_account(namespace, V1ServiceAccount(metadata=V1ObjectMeta(name=name)))
+              except ApiException as e:
+                  if e.status != 409:
+                      # a 409 just means it already exists, which is fine
+                      spawner.log.exception(f'Failed to create service account {name} in the {namespace} namespace')
+                      raise
+              try:
+                  rules = [
+                      V1PolicyRule(
+                          api_groups=[''],
+                          resources=['pods', 'services', 'configmaps'],
+                          verbs=['get', 'watch', 'list', 'create', 'delete', 'update']
+                      )
+                  ]
+                  role = V1Role(rules=rules)
+                  role.metadata = V1ObjectMeta(namespace=namespace, name=role_name)
+
+                  rbac = RbacAuthorizationV1Api()
+                  rbac.create_namespaced_role(namespace, role)
+              except ApiException as e:
+                  if e.status != 409:
+                      # a 409 just means it already exists, which is fine
+                      spawner.log.exception(f'Failed to create role {role_name} for service account {name} in the {namespace} namespace')
+                      raise
+              try:
+                  subject = V1Subject(kind='ServiceAccount', name=name, namespace=namespace)
+                  role_ref = V1RoleRef(api_group='rbac.authorization.k8s.io', kind='Role', name=role_name)
+                  metadata = V1ObjectMeta(name=f'{role_name}-binding')
+                  role_binding = V1RoleBinding(metadata=metadata, role_ref=role_ref, subjects=[subject])
+                  rbac = RbacAuthorizationV1Api()
+                  rbac.create_namespaced_role_binding(namespace=namespace, body=role_binding)
+              except ApiException as e:
+                  if e.status != 409:
+                      # a 409 just means it already exists, which is fine
+                      spawner.log.exception(f'Failed to create role binding for {role_name} and service account {name} in the {namespace} namespace')
+                      raise
           async def pre_spawn_hook(spawner):
+              spawner.environment['NAMESPACE_USER'] = spawner.namespace
+              namespace_parent = spawner.environment['NAMESPACE_PARENT']
+
+              # create the user namespace before running the spawner
+              if spawner.enable_user_namespaces:
+                  await spawner._ensure_namespace()
+                  await ensure_service_account_role(spawner, 'default', spawner.namespace, 'default-role')
+
+                  # copy secrets and configmaps from the parent namespace into the new one
+                  api = spawner.api
+                  for s in api.list_namespaced_secret(namespace_parent).items:
+                      s.metadata.namespace = spawner.namespace
+                      s.metadata.resource_version = None
+                      try:
+                          api.create_namespaced_secret(spawner.namespace, s)
+                      except ApiException as e:
+                          if e.status != 409:
+                              spawner.log.exception(f'Failed to copy secret {s.metadata.name} into the {spawner.namespace} namespace')
+                              raise
+                          # a 409 means it already exists, so refresh it with the parent namespace's copy
+                          api.patch_namespaced_secret(s.metadata.name, spawner.namespace, s)
+
+                  for m in api.list_namespaced_config_map(namespace_parent).items:
+                      m.metadata.namespace = spawner.namespace
+                      m.metadata.resource_version = None
+                      try:
+                          api.create_namespaced_config_map(spawner.namespace, m)
+                      except ApiException as e:
+                          if e.status != 409:
+                              spawner.log.exception(f'Failed to copy configmap {m.metadata.name} into the {spawner.namespace} namespace')
+                              raise
+                          # a 409 means it already exists, so refresh it with the parent namespace's copy
+                          api.patch_namespaced_config_map(m.metadata.name, spawner.namespace, m)
+
+              # unmount the spark default configuration (and its py env preload) when it isn't needed; for details see
+              # https://github.com/jupyterhub/kubespawner/issues/501
+              # https://discourse.jupyter.org/t/tailoring-spawn-options-and-server-configuration-to-certain-users/8449
+              if spawner.user_options.get('profile', '') != 'pyspark':
+                  spawner.volume_mounts = list(filter(lambda e: 'spark' not in e.get('subPath', ''), spawner.volume_mounts))
+              # expose the Spark UI (needed only in the pyspark profile case)
+              else:
+                  spawner.extra_container_config = {'ports': [
+                      {'containerPort': 8888, 'name': 'notebook-port', 'protocol': 'TCP'},
+                      {'containerPort': 4040, 'name': 'spark-ui', 'protocol': 'TCP'}
+                  ]}
+
               username = spawner.user.name
               # `username` is an email address. We use that email address to look up the
               # user in the Django App
@@ -147,6 +243,21 @@ daskhub:
           c.KubeSpawner.pre_spawn_hook = pre_spawn_hook
+        # this is the spawner's post-stop hook (not related to the notebook container lifecycle hooks);
+        # we don't need it, so it stays commented out below
+        post_stop_hook: |
+          from kubernetes.client.rest import ApiException
+          async def post_stop_hook(spawner):
+              try:
+                  spawner.api.delete_namespace(spawner.namespace)
+              except ApiException as e:
+                  if e.status != 404:
+                      # a 404 just means it is already removed, which is fine
+                      spawner.log.exception(f'Failed to delete namespace {spawner.namespace}')
+                      raise
+
+          # c.KubeSpawner.post_stop_hook = post_stop_hook
+
     proxy:
       https:
         enabled: true
        letsencrypt:
          contactEmail: "taugspurger@microsoft.com"
     singleuser:
+      # if not set, it still falls back to the default service account, but without the ServiceAccount secrets mounted
+      serviceAccountName: default
+
       # These limits match the "large" profiles, so that a user requesting large will be successfully scheduled.
       # The user scheduler doesn't evict multiple placeholders.
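Aside, before the singleuser resource settings continue below: the `00-spark-conf.py` startup file added in `helm/profiles.yaml` only defines the `spark_config` dict; turning it into a running session is left to notebook code. A minimal sketch of that step, assuming `pyspark` is installed on the PySpark image (illustrative only, not something this patch ships):

```python
# Sketch: build a SparkSession from the spark_config dict injected by
# 00-spark-conf.py (assumed to have run as an IPython startup file).
from pyspark.sql import SparkSession

builder = SparkSession.builder
for key, value in spark_config.items():
    builder = builder.config(key, value)

spark = builder.getOrCreate()
# The driver UI listens on spark.ui.port (4040); pre_spawn_hook exposes that
# containerPort only for the pyspark profile.
print(spark.sparkContext.uiWebUrl)
```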
memory: @@ -198,6 +312,12 @@ daskhub: - name: driven-data mountPath: /driven-data/ + extraFiles: + spark_default_configuration: + mountPath: /home/jovyan/.ipython/profile_default/startup/00-spark-conf.py + spark_executor_template: + mountPath: /home/jovyan/executor-template.yml + extraEnv: DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE: '{JUPYTER_IMAGE_SPEC}' DASK_DISTRIBUTED__DASHBOARD__LINK: '/user/{JUPYTERHUB_USER}/proxy/{port}/status' @@ -222,7 +342,7 @@ daskhub: auth: jupyterhub: apiToken: "{{ tf.jupyterhub_dask_gateway_token }}" - apiUrl: http://proxy-http:8000/compute/hub/api + apiUrl: http://proxy-http.${namespace}.svc.cluster.local:8000/compute/hub/api affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/terraform/prod/main.tf b/terraform/prod/main.tf index 320756d..bce5e34 100644 --- a/terraform/prod/main.tf +++ b/terraform/prod/main.tf @@ -23,6 +23,7 @@ module "resources" { jupyterhub_singleuser_image_name = "pcccr.azurecr.io/public/planetary-computer/python" jupyterhub_singleuser_image_tag = "2022.01.17.0" python_image = "pcccr.azurecr.io/public/planetary-computer/python:2022.01.17.0" + pyspark_image = "daunnc/planetary-computer-pyspark:2021.11.29.0-gdal3.4-3.1-rf" r_image = "pcccr.azurecr.io/public/planetary-computer/r:2022.01.17.0" gpu_pytorch_image = "pcccr.azurecr.io/public/planetary-computer/gpu-pytorch:2022.01.17.0" gpu_tensorflow_image = "pcccr.azurecr.io/public/planetary-computer/gpu-tensorflow:2022.01.17.0" diff --git a/terraform/resources/aks.tf b/terraform/resources/aks.tf index cd81b17..bbfcfe9 100644 --- a/terraform/resources/aks.tf +++ b/terraform/resources/aks.tf @@ -5,6 +5,10 @@ resource "azurerm_kubernetes_cluster" "pc_compute" { dns_prefix = "${local.maybe_staging_prefix}-cluster" kubernetes_version = var.kubernetes_version sku_tier = "Paid" + + role_based_access_control { + enabled = var.enable_role_based_access_control + } addon_profile { kube_dashboard { @@ -126,3 +130,43 @@ resource "azurerm_kubernetes_cluster_node_pool" "cpu_worker_pool" { } } + +# Spark supports pool with no taints and can select nodes via selector only by default +# https://spark.apache.org/docs/latest/running-on-kubernetes.html#how-it-works +# We use a non default spark-executors template to address this issue +resource "azurerm_kubernetes_cluster_node_pool" "spark_cpu_worker_pool" { + name = "spcpuworker" + kubernetes_cluster_id = azurerm_kubernetes_cluster.pc_compute.id + vm_size = var.cpu_worker_vm_size + enable_auto_scaling = true + os_disk_size_gb = 128 + orchestrator_version = var.kubernetes_version + priority = "Spot" # Regular when not set + eviction_policy = "Delete" + spot_max_price = -1 + vnet_subnet_id = azurerm_subnet.node_subnet.id + + node_labels = { + "k8s.spark.org/dedicated" = "worker", + "pc.microsoft.com/workerkind" = "spark-cpu", + "kubernetes.azure.com/scalesetpriority" = "spot" + } + + node_taints = [ + "kubernetes.azure.com/scalesetpriority=spot:NoSchedule", + ] + + min_count = var.cpu_worker_pool_min_count + max_count = var.cpu_worker_max_count + tags = { + Environment = "Production" + ManagedBy = "AI4E" + } + + lifecycle { + ignore_changes = [ + node_count, + ] + } + +} diff --git a/terraform/resources/hub.tf b/terraform/resources/hub.tf index fa948c3..6c1e7d1 100644 --- a/terraform/resources/hub.tf +++ b/terraform/resources/hub.tf @@ -20,7 +20,7 @@ resource "helm_release" "dhub" { values = [ "${templatefile("../../helm/values.yaml", { jupyterhub_host = var.jupyterhub_host, namespace = var.environment })}", 
"${file("../../helm/jupyterhub_opencensus_monitor.yaml")}", - "${templatefile("../../helm/profiles.yaml", { python_image = var.python_image, r_image = var.r_image, gpu_pytorch_image = var.gpu_pytorch_image, gpu_tensorflow_image = var.gpu_tensorflow_image, qgis_image = var.qgis_image })}", + "${templatefile("../../helm/profiles.yaml", { python_image = var.python_image, pyspark_image = var.pyspark_image, r_image = var.r_image, gpu_pytorch_image = var.gpu_pytorch_image, gpu_tensorflow_image = var.gpu_tensorflow_image, qgis_image = var.qgis_image })}", # workaround https://github.com/hashicorp/terraform-provider-helm/issues/669 "${templatefile("../../helm/kbatch-proxy-values.yaml", { jupyterhub_host = var.jupyterhub_host, dns_label = var.dns_label })}", ] diff --git a/terraform/resources/providers.tf b/terraform/resources/providers.tf index bc1335a..add57ed 100644 --- a/terraform/resources/providers.tf +++ b/terraform/resources/providers.tf @@ -9,6 +9,7 @@ provider "helm" { client_key = base64decode(azurerm_kubernetes_cluster.pc_compute.kube_config[0].client_key) client_certificate = base64decode(azurerm_kubernetes_cluster.pc_compute.kube_config[0].client_certificate) cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.pc_compute.kube_config[0].cluster_ca_certificate) + # config_path = "~/.kube/config" } } @@ -17,6 +18,7 @@ provider "kubernetes" { client_key = base64decode(azurerm_kubernetes_cluster.pc_compute.kube_config[0].client_key) client_certificate = base64decode(azurerm_kubernetes_cluster.pc_compute.kube_config[0].client_certificate) cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.pc_compute.kube_config[0].cluster_ca_certificate) + # config_path = "~/.kube/config" } diff --git a/terraform/resources/variables.tf b/terraform/resources/variables.tf index fa1a441..1ecec7c 100644 --- a/terraform/resources/variables.tf +++ b/terraform/resources/variables.tf @@ -46,6 +46,11 @@ variable "python_image" { description = "The tag for the python environment image." } +variable "pyspark_image" { + type = string + description = "The tag for the PySpark environment image." +} + variable "r_image" { type = string description = "The tag for the R environment image." @@ -110,6 +115,12 @@ variable "workspace_id" { description = "A random (unique) string to use for the Log Analystics workspace." } +variable "enable_role_based_access_control" { + type = bool + default = true + description = "Enable Role Based Access Control." 
+} + # ---------------------------------------------------------------------------- # Deploy values diff --git a/terraform/staging/main.tf b/terraform/staging/main.tf index 69c3bd0..c80e989 100644 --- a/terraform/staging/main.tf +++ b/terraform/staging/main.tf @@ -23,6 +23,7 @@ module "resources" { jupyterhub_singleuser_image_name = "pcccr.azurecr.io/public/planetary-computer/python" jupyterhub_singleuser_image_tag = "2022.01.17.0" python_image = "pcccr.azurecr.io/public/planetary-computer/python:2022.01.17.0" + pyspark_image = "daunnc/planetary-computer-pyspark:2021.11.29.0-gdal3.4-3.1-rf" r_image = "pcccr.azurecr.io/public/planetary-computer/r:2022.01.17.0" gpu_pytorch_image = "pcccr.azurecr.io/public/planetary-computer/gpu-pytorch:2022.01.17.0" gpu_tensorflow_image = "pcccr.azurecr.io/public/planetary-computer/gpu-tensorflow:2022.01.17.0" From e88379e9c565aee70e94bc0ac1497e4c0d1449ed Mon Sep 17 00:00:00 2001 From: Grigory Pomadchin Date: Thu, 27 Jan 2022 13:33:48 -0500 Subject: [PATCH 2/3] Fix the pvc conflict --- .../files/etc/singleuser/k8s-lifecycle-hook-post-start.sh | 5 +++++ helm/profiles.yaml | 2 +- helm/values.yaml | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh b/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh index 868775a..1d653a5 100755 --- a/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh +++ b/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh @@ -48,6 +48,11 @@ Terminal=false Hidden=false EOF +# Add Spark default configuration if mounted +if [ -d "/etc/spark-ipython/profile_default/startup" ]; then + mkdir -p /home/jovyan/.ipython/profile_default/startup/ && \ + mv /etc/spark-ipython/profile_default/startup/* /home/jovyan/.ipython/profile_default/startup/ +fi echo "Removing lost+found" # Remove empty lost+found directories diff --git a/helm/profiles.yaml b/helm/profiles.yaml index 4697e2b..dd6673a 100644 --- a/helm/profiles.yaml +++ b/helm/profiles.yaml @@ -156,7 +156,7 @@ daskhub: 'spark.kubernetes.authenticate.driver.serviceAccountName': 'default', 'spark.kubernetes.authenticate.caCertFile': '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt', 'spark.kubernetes.authenticate.oauthTokenFile': '/var/run/secrets/kubernetes.io/serviceaccount/token', - 'spark.kubernetes.executor.podTemplateFile': '/home/jovyan/executor-template.yml', + 'spark.kubernetes.executor.podTemplateFile': '/etc/spark/executor-template.yml', 'spark.kubernetes.node.selector.k8s.spark.org/dedicated': 'worker', 'spark.kubernetes.node.selector.pc.microsoft.com/workerkind': 'spark-cpu', 'spark.kubernetes.node.selector.kubernetes.azure.com/scalesetpriority': 'spot' diff --git a/helm/values.yaml b/helm/values.yaml index e18f06e..48de752 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -313,10 +313,10 @@ daskhub: mountPath: /driven-data/ extraFiles: - spark_default_configuration: - mountPath: /home/jovyan/.ipython/profile_default/startup/00-spark-conf.py spark_executor_template: - mountPath: /home/jovyan/executor-template.yml + mountPath: /etc/spark/executor-template.yml + spark_default_configuration: + mountPath: /etc/spark-ipython/profile_default/startup/00-spark-conf.py extraEnv: DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE: '{JUPYTER_IMAGE_SPEC}' From 1fbb48ffd092d56a2cdbc864bfd12c3b07c39a9a Mon Sep 17 00:00:00 2001 From: Grigory Pomadchin Date: Thu, 27 Jan 2022 13:54:36 -0500 Subject: [PATCH 3/3] Make changes HOME location agnostic --- 
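An aside on the executor pod template that patch 1 adds and patch 2 re-mounts under `/etc/spark/`: since it is hand-written YAML, a small parse-and-assert step catches field-name slips (for example `Kind:` versus `kind:`) before Spark hands the template to Kubernetes. A sketch, assuming PyYAML is available in the notebook image:

```python
# Sketch: sanity-check the mounted executor template before pointing
# spark.kubernetes.executor.podTemplateFile at it.
import yaml

with open("/etc/spark/executor-template.yml") as f:
    template = yaml.safe_load(f)

assert template.get("kind") == "Pod", f"unexpected kind: {template.get('kind')!r}"
tolerations = template.get("spec", {}).get("tolerations", [])
assert any(t.get("key") == "kubernetes.azure.com/scalesetpriority" for t in tolerations), \
    "missing the spot toleration required by the spark-cpu node pool"
```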
.../files/etc/singleuser/k8s-lifecycle-hook-post-start.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh b/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh index 1d653a5..af9368d 100755 --- a/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh +++ b/helm/chart/files/etc/singleuser/k8s-lifecycle-hook-post-start.sh @@ -50,8 +50,8 @@ EOF # Add Spark default configuration if mounted if [ -d "/etc/spark-ipython/profile_default/startup" ]; then - mkdir -p /home/jovyan/.ipython/profile_default/startup/ && \ - mv /etc/spark-ipython/profile_default/startup/* /home/jovyan/.ipython/profile_default/startup/ + mkdir -p ~/.ipython/profile_default/startup/ && \ + mv /etc/spark-ipython/profile_default/startup/* ~/.ipython/profile_default/startup/ fi echo "Removing lost+found"
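Finally, since `post_stop_hook` is left commented out, per-user namespaces are never deleted when a notebook stops. If they ever need pruning, an out-of-band sweep along these lines would do it; the `prod-` prefix is an assumption about the user-namespace naming in this deployment, and `component=singleuser-server` is the standard zero-to-jupyterhub label on notebook pods.

```python
# Sketch: delete per-user namespaces that no longer run a notebook pod.
# Intended to run out-of-band (e.g. a cron job); not part of this patch series.
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

for ns in v1.list_namespace().items:
    name = ns.metadata.name
    if not name.startswith("prod-"):  # assumed user-namespace prefix
        continue
    pods = v1.list_namespaced_pod(name, label_selector="component=singleuser-server").items
    if not pods:
        print(f"deleting idle user namespace {name}")
        v1.delete_namespace(name)
```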