Commit d2103c6

fix: jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data dev install (#257)
* fix: jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data dev install
* docs: remove dead link
* update notebook
1 parent e706b7b commit d2103c6

File tree

3 files changed: +5 −5 lines changed

docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc

Lines changed: 1 addition & 2 deletions
@@ -3,7 +3,6 @@
 :scikit-lib: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
 :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu
 :spark-pkg: https://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html
-:forest-article: https://towardsdatascience.com/isolation-forest-and-spark-b88ade6c63ff
 :pyspark: https://spark.apache.org/docs/latest/api/python/getting_started/index.html
 :forest-algo: https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf
 :nyc-taxi: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
@@ -133,7 +132,7 @@ In practice, clients of Spark Connect do not need a full-blown Spark installatio
 == Model details
 
 The job uses an implementation of the Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn {scikit-lib}[library]:
-the model is trained and then invoked by a user-defined function (see {forest-article}[this article] for how to call the sklearn library with a pyspark UDF), all of which is run using the Spark Connect executors.
+the model is trained and then invoked by a user-defined function running on the Spark Connect executors.
 This type of model attempts to isolate each data point by continually partitioning the data.
 Data closely packed together will require more partitions to separate data points.
 In contrast, any outliers will require less: the number of partitions needed for a particular data point is thus inversely proportional to the anomaly "score".

stacks/jupyterhub-pyspark-hdfs/notebook.ipynb

Lines changed: 2 additions & 2 deletions
@@ -27,14 +27,14 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "spark = (\n",
 "    SparkSession\n",
 "    .builder\n",
-"    .remote(\"sc://spark-connect-server-default:15002\")\n",
+"    .remote(\"sc://spark-connect-server:15002\")\n",
 "    .appName(\"taxi-data-anomaly-detection\")\n",
 "    .getOrCreate()\n",
 ")"

stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml

Lines changed: 2 additions & 1 deletion
@@ -53,8 +53,9 @@ spec:
   - name: hdfs-discovery-configmap
     configMap:
       name: hdfs
-  config:
+  roleConfig:
     listenerClass: external-unstable
+  config:
     resources:
       memory:
         limit: "2Gi"
