1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ package.json
jupyterhub_cookie_secret
jupyterhub-proxy.pid
.kubeconfig
.history
2 changes: 1 addition & 1 deletion README.md
@@ -28,7 +28,7 @@ The bulk of the configuration is done in `values.yaml`. See the inline comments

`jupyterhub_opencensus_monitor.yaml` sets `daskhub.jupyterhub.hub.extraFiles.jupyterhub_open_census_monitor.stringData` to be the `jupyterhub_opencensus_monitor.py` script (see below). We couldn't figure out how to get the helm-release provider working with kubectl's `set-file`, so we needed to inline the script. There's probably a better way to do this.
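
A minimal sketch (not part of this PR) of what that inlining looks like; the mount path here is hypothetical, and in the real file the full monitor script is the value of `stringData`:

```yaml
daskhub:
  jupyterhub:
    hub:
      extraFiles:
        jupyterhub_open_census_monitor:
          # hypothetical mount path inside the hub pod
          mountPath: /usr/local/etc/jupyterhub/jupyterhub_opencensus_monitor.py
          stringData: |
            # contents of jupyterhub_opencensus_monitor.py inlined here
            ...
```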

Finally, the custom UI elements used by the Hub process and additional notebook server configuration are included under `helm/chart/files` and `helm/cart/templates`. These are mounted into the pods. See [custom UI](#custom-ui) for more.
Finally, the custom UI elements used by the Hub process and additional notebook server configuration are included under `helm/chart/files` and `helm/chart/templates`. These are mounted into the pods. See [custom UI](#custom-ui) for more.

## Terraform

@@ -48,6 +48,11 @@ Terminal=false
Hidden=false
EOF

# Add Spark default configuration if mounted
if [ -d "/etc/spark-ipython/profile_default/startup" ]; then
mkdir -p ~/.ipython/profile_default/startup/ && \
mv /etc/spark-ipython/profile_default/startup/* ~/.ipython/profile_default/startup/
fi

echo "Removing lost+found"
# Remove empty lost+found directories
33 changes: 33 additions & 0 deletions helm/chart/templates/autohttps-rbac.yaml
@@ -0,0 +1,33 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: autohttps
namespace: {{ .Release.Namespace }}
labels:
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
component: autohttps
heritage: {{ .Release.Service }}
release: {{ .Release.Name }}
rules:
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get", "patch", "list", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: autohttps
namespace: {{ .Release.Namespace }}
labels:
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
component: autohttps
heritage: {{ .Release.Service }}
release: {{ .Release.Name }}
subjects:
- kind: ServiceAccount
name: autohttps
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: autohttps
39 changes: 39 additions & 0 deletions helm/chart/templates/hub-rbac.yaml
@@ -0,0 +1,39 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: hub
namespace: {{ .Release.Namespace }}
labels:
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
component: hub
heritage: {{ .Release.Service }}
release: {{ .Release.Name }}
rules:
- apiGroups: [""]
resources: ["pods", "persistentvolumeclaims", "secrets", "configmaps", "services", "namespaces", "serviceaccounts"]
verbs: ["get", "watch", "list", "create", "delete", "update"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources: ["roles", "rolebindings"]
verbs: ["get", "watch", "list", "create", "delete", "update"]
- apiGroups: [""]
resources: ["events"]
verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: hub
namespace: {{ .Release.Namespace }}
labels:
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
component: hub
heritage: {{ .Release.Service }}
release: {{ .Release.Name }}
subjects:
- kind: ServiceAccount
name: hub
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: hub
11 changes: 10 additions & 1 deletion helm/kbatch-proxy-values.yaml
@@ -47,7 +47,16 @@ kbatch-proxy:
mountPath: /profile-template.yaml
data:
python:
image: mcr.microsoft.com/planetary-computer/python:2021.11.30.0
image: cr.microsoft.com/planetary-computer/python:2021.11.30.0
resources:
requests:
cpu: "3.6"
memory: "27G"
limits:
cpu: "4"
memory: "32G"
pyspark:
image: daunnc/planetary-computer-python-jdk:2021.11.30.0
resources:
requests:
cpu: "3.6"
89 changes: 89 additions & 0 deletions helm/profiles.yaml
@@ -20,6 +20,24 @@ daskhub:
values:
- gpu

# Spark -----------------------------------------------------
- display_name: "PySpark"
default: "True"
description: '4 cores, 32 GiB of memory. <a href="https://github.com/pangeo-data/pangeo-docker-images" target="_blank">Pangeo Notebook</a> environment powered by <a href="https://rasterframes.io/">Raster Frames</a>, <a href="http://geotrellis.io/">GeoTrellis</a> and <a href="https://spark.apache.org/">Apache Spark</a>.'
kubespawner_override:
image: "${pyspark_image}"
[Author comment] Note: the only difference between the PySpark and Python images is the set of underlying dependencies.

cpu_guarantee: 3
cpu_limit: 4
mem_guarantee: "25G"
mem_limit: "32G"
default_url: "/lab/tree/PlanetaryComputerExamples/README.md"
node_affinity_required:
- matchExpressions:
- key: pc.microsoft.com/userkind
operator: NotIn
values:
- gpu

# R --------------------------------------------------------------------
- display_name: "R"
description: '8 cores, 64 GiB of memory. R geospatial environment.'
@@ -108,3 +126,74 @@ daskhub:
operator: NotIn
values:
- gpu

extraFiles:
spark_default_configuration:
# TODO(https://github.com/hashicorp/terraform-provider-helm/issues/628): use set-file
stringData: |
"""
Default Spark configuration init for the Jupyter instance.
"""
import socket
import os
notebook_ip = socket.gethostbyname(socket.gethostname())
namespace_user = os.environ.get('NAMESPACE_USER', '')
spark_config = {
'spark.master': 'k8s://https://kubernetes.default.svc.cluster.local',
'spark.app.name': 'STAC API with RF in K8S',
[Author comment — @pomadchin, Jan 24, 2022] The Spark app name should probably be picked up from the notebook name, or use the default one (more Microsofty).

'spark.ui.port': '4040',
'spark.driver.blockManager.port': '7777',
'spark.driver.port': '2222',
'spark.driver.host': notebook_ip,
'spark.driver.bindAddress': '0.0.0.0',
'spark.executor.instances': '2',
'spark.executor.memory': '4g',
'spark.driver.memory': '1g',
'spark.executor.cores': '3',
'spark.kubernetes.namespace': namespace_user,
'spark.kubernetes.container.image': 'quay.io/daunnc/spark-k8s-py-3.8.8-gdal32-msftpc:3.1.2',
[Author comment] Should be replaced by the MSFTPC containers.

'spark.kubernetes.executor.deleteOnTermination': 'true',
'spark.kubernetes.authenticate.driver.serviceAccountName': 'default',
'spark.kubernetes.authenticate.caCertFile': '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt',
'spark.kubernetes.authenticate.oauthTokenFile': '/var/run/secrets/kubernetes.io/serviceaccount/token',
'spark.kubernetes.executor.podTemplateFile': '/etc/spark/executor-template.yml',
'spark.kubernetes.node.selector.k8s.spark.org/dedicated': 'worker',
'spark.kubernetes.node.selector.pc.microsoft.com/workerkind': 'spark-cpu',
'spark.kubernetes.node.selector.kubernetes.azure.com/scalesetpriority': 'spot'
}

# By default Spark can only use node pools without taints, selecting nodes via a node selector.
# This pod template adds a toleration so Spark executors can also run on Azure spot instances.
spark_executor_template:
stringData: |
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
apiVersion: v1
kind: Pod
metadata:
labels:
template-label-key: executor-template-label-value
spec:
containers:
- name: test-executor-container
image: will-be-overwritten
# extra toleration to support Spot instances
tolerations:
- key: "kubernetes.azure.com/scalesetpriority"
operator: "Equal"
value: "spot"
effect: "NoSchedule"
124 changes: 122 additions & 2 deletions helm/values.yaml
@@ -85,10 +85,17 @@ daskhub:
c.KubeSpawner.extra_labels = {}
kubespawner: |
c.KubeSpawner.start_timeout = 15 * 60 # 15 minutes
# pass the parent namespace through, needed for pre_spawn_hook to copy resources
c.KubeSpawner.environment['NAMESPACE_PARENT'] = c.KubeSpawner.namespace
[Author comment] TODO: try the template replacement instead of setting the parent namespace variable.

# the hub allocates notebook servers in per-user namespaces
c.KubeSpawner.enable_user_namespaces = True
# the hub URL must be reachable across namespaces
c.KubeSpawner.hub_connect_url = "http://hub.${namespace}.svc.cluster.local:8081"

01-add-dask-gateway-values: |
# The daskhub helm chart doesn't correctly handle hub.baseUrl.
# DASK_GATEWAY__PUBLIC_ADDRESS set via terraform
c.KubeSpawner.environment["DASK_GATEWAY__ADDRESS"] = "http://proxy-http:8000/compute/services/dask-gateway/"
c.KubeSpawner.environment["DASK_GATEWAY__ADDRESS"] = "http://proxy-http.${namespace}.svc.cluster.local:8000/compute/services/dask-gateway/"
c.KubeSpawner.environment["DASK_GATEWAY__PUBLIC_ADDRESS"] = "https://${jupyterhub_host}/compute/services/dask-gateway/"
templates: |
c.JupyterHub.template_paths.insert(0, "/etc/jupyterhub/templates")
@@ -97,8 +104,97 @@
# Sets the following
# 1. environment variable PC_SDK_SUBSCRIPTION_KEY
# ---------------------------------------------------
from kubernetes.client import RbacAuthorizationV1Api
from kubernetes.client.rest import ApiException
from kubernetes.client.models import V1Role, V1PolicyRule, V1ObjectMeta, V1Subject, V1RoleRef, V1RoleBinding, V1ServiceAccount

async def ensure_service_account_role(spawner, name, namespace, role_name):
api = spawner.api
try:
api.create_namespaced_service_account(namespace, V1ServiceAccount(metadata=V1ObjectMeta(name=name)))
except ApiException as e:
if e.status != 409:
# It's fine if it already exists
spawner.log.exception(f'Failed to create service account {name} in the {namespace} namespace')
raise
try:
rules = [
V1PolicyRule(
[''],
resources=['pods', 'services', 'configmaps'],
verbs=['get', 'watch', 'list', 'create', 'delete', 'update']
)
]
role = V1Role(rules=rules)
role.metadata = V1ObjectMeta(namespace=namespace, name=role_name)

rbac = RbacAuthorizationV1Api()
rbac.create_namespaced_role(namespace, role)
except ApiException as e:
if e.status != 409:
# It's fine if it already exists
spawner.log.exception(f'Failed to create role {role} for service account {name} in the {namespace} namespace')
raise
try:
subject = V1Subject(kind='ServiceAccount', name=name, namespace=namespace)
role_ref = V1RoleRef(api_group='rbac.authorization.k8s.io', kind='Role', name=role_name)
metadata = V1ObjectMeta(name=f'{role_name}-binding')
role_binding = V1RoleBinding(metadata=metadata, role_ref=role_ref, subjects=[subject])
rbac = RbacAuthorizationV1Api()
rbac.create_namespaced_role_binding(namespace=namespace, body=role_binding)
except ApiException as e:
if e.status != 409:
# It's fine if it already exists
spawner.log.exception(f'Failed to create role binding for {role} and service account {name} in the {namespace} namespace')
raise

async def pre_spawn_hook(spawner):
spawner.environment['NAMESPACE_USER'] = spawner.namespace
namespace_parent = spawner.environment['NAMESPACE_PARENT']

# create user namespace before running the spawner
if spawner.enable_user_namespaces:
await spawner._ensure_namespace()
await ensure_service_account_role(spawner, 'default', spawner.namespace, 'default-role')

# copy secrets and configmaps into the new namespace
api = spawner.api
for s in api.list_namespaced_secret(namespace_parent).items:
s.metadata.namespace = spawner.namespace
s.metadata.resource_version = None
try:
api.create_namespaced_secret(spawner.namespace, s)
except ApiException as e:
if e.status == 409:
# It already exists: patch it with the parent's copy instead
api.patch_namespaced_secret(s.metadata.name, spawner.namespace, s)
else:
spawner.log.exception(f'Failed to copy secret {s.metadata.name} into namespace {spawner.namespace}')
raise

for m in api.list_namespaced_config_map(namespace_parent).items:
m.metadata.namespace = spawner.namespace
m.metadata.resource_version = None
try:
api.create_namespaced_config_map(spawner.namespace, m)
except ApiException as e:
if e.status == 409:
# It already exists: patch it with the parent's copy instead
api.patch_namespaced_config_map(m.metadata.name, spawner.namespace, m)
else:
spawner.log.exception(f'Failed to copy configmap {m.metadata.name} into namespace {spawner.namespace}')
raise

# unmount spark default configuration with py env preload if not needed, for more details see
# https://github.com/jupyterhub/kubespawner/issues/501
# https://discourse.jupyter.org/t/tailoring-spawn-options-and-server-configuration-to-certain-users/8449
if spawner.user_options.get('profile', '') != 'pyspark':
spawner.volume_mounts = list(filter(lambda e: 'spark' not in e.get('subPath', ''), spawner.volume_mounts))
# expose the Spark UI (needed only in the pyspark profile case)
else:
spawner.extra_container_config = {'ports': [
{'containerPort': 8888, 'name': 'notebook-port', 'protocol': 'TCP'},
{'containerPort': 4040, 'name': 'spark-ui', 'protocol': 'TCP'}
]}

username = spawner.user.name
# `username` is an email address. We use that email address to look up the
# user in the Django App
@@ -147,13 +243,31 @@ daskhub:

c.KubeSpawner.pre_spawn_hook = pre_spawn_hook

# this is the spawner's post-stop hook, not related to the notebook lifecycle;
# we don't need it
post_stop_hook: |
[Author comment] We don't need this here; I will remove it unless we want to keep it for fun. cc @TomAugspurger

from kubernetes.client.rest import ApiException
async def post_stop_hook(spawner):
try:
spawner.api.delete_namespace(spawner.namespace)
except ApiException as e:
if e.status != 409:
# It's fine if it is already removed
spawner.log.exception(f'Failed to delete namespace {spawner.namespace}')
raise

# c.KubeSpawner.post_stop_hook = post_stop_hook

proxy:
https:
enabled: true
letsencrypt:
contactEmail: "[email protected]"

singleuser:
# if not set, it also falls back to `default`, but with no ServiceAccount secrets mounted
serviceAccountName: default

# These limits match the "large" profiles, so that a user requesting large will be successfully scheduled.
# The user scheduler doesn't evict multiple placeholders.
memory:
@@ -198,6 +312,12 @@ daskhub:
- name: driven-data
mountPath: /driven-data/

extraFiles:
spark_executor_template:
mountPath: /etc/spark/executor-template.yml
spark_default_configuration:
mountPath: /etc/spark-ipython/profile_default/startup/00-spark-conf.py

extraEnv:
DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE: '{JUPYTER_IMAGE_SPEC}'
DASK_DISTRIBUTED__DASHBOARD__LINK: '/user/{JUPYTERHUB_USER}/proxy/{port}/status'
@@ -222,7 +342,7 @@
auth:
jupyterhub:
apiToken: "{{ tf.jupyterhub_dask_gateway_token }}"
apiUrl: http://proxy-http:8000/compute/hub/api
apiUrl: http://proxy-http.${namespace}.svc.cluster.local:8000/compute/hub/api
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
1 change: 1 addition & 0 deletions terraform/prod/main.tf
@@ -23,6 +23,7 @@ module "resources" {
jupyterhub_singleuser_image_name = "pcccr.azurecr.io/public/planetary-computer/python"
jupyterhub_singleuser_image_tag = "2022.01.17.0"
python_image = "pcccr.azurecr.io/public/planetary-computer/python:2022.01.17.0"
pyspark_image = "daunnc/planetary-computer-pyspark:2021.11.29.0-gdal3.4-3.1-rf"
[Author comment] TODO: make PRs against the MSFTPC repos with the containers.

r_image = "pcccr.azurecr.io/public/planetary-computer/r:2022.01.17.0"
gpu_pytorch_image = "pcccr.azurecr.io/public/planetary-computer/gpu-pytorch:2022.01.17.0"
gpu_tensorflow_image = "pcccr.azurecr.io/public/planetary-computer/gpu-tensorflow:2022.01.17.0"