diff --git a/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc b/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc index 138dac25..592d2c34 100644 --- a/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc +++ b/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc @@ -3,7 +3,6 @@ :scikit-lib: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu :spark-pkg: https://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html -:forest-article: https://towardsdatascience.com/isolation-forest-and-spark-b88ade6c63ff :pyspark: https://spark.apache.org/docs/latest/api/python/getting_started/index.html :forest-algo: https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf :nyc-taxi: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page @@ -133,7 +132,7 @@ In practice, clients of Spark Connect do not need a full-blown Spark installatio == Model details The job uses an implementation of the Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn {scikit-lib}[library]: -the model is trained and then invoked by a user-defined function (see {forest-article}[this article] for how to call the sklearn library with a pyspark UDF), all of which is run using the Spark Connect executors. +the model is trained and then invoked by a user-defined function running on the Spark Connect executors. This type of model attempts to isolate each data point by continually partitioning the data. Data closely packed together will require more partitions to separate data points. In contrast, any outliers will require less: the number of partitions needed for a particular data point is thus inversely proportional to the anomaly "score". diff --git a/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb b/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb index acb9b431..3e81c879 100644 --- a/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb +++ b/stacks/jupyterhub-pyspark-hdfs/notebook.ipynb @@ -27,14 +27,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark = (\n", " SparkSession\n", " .builder\n", - " .remote(\"sc://spark-connect-server-default:15002\")\n", + " .remote(\"sc://spark-connect-server:15002\")\n", " .appName(\"taxi-data-anomaly-detection\")\n", " .getOrCreate()\n", ")" diff --git a/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml b/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml index 4e88bfcc..5fe372b2 100644 --- a/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml +++ b/stacks/jupyterhub-pyspark-hdfs/spark_connect.yaml @@ -53,8 +53,9 @@ spec: - name: hdfs-discovery-configmap configMap: name: hdfs - config: + roleConfig: listenerClass: external-unstable + config: resources: memory: limit: "2Gi"