address comments

allisonwang-db · allisonwang-db · commit a7d719d867af · 2024-11-26T01:29:10.000-08:00
diff --git a/demo.ipynb b/demo.ipynb
@@ -1,69 +1,113 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "125a1871-6cab-4dc4-9fd5-4e5dbd63ada6",
+   "cell_type": "markdown",
+   "id": "19b1960e-9e0a-401f-be15-d343902eaa21",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "import warnings\n",
-    "warnings.filterwarnings('ignore')"
+    "# Spark HuggingFace Connector Demo"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "38dc7e9e-35fd-4604-9be3-1a1a8749fbcb",
+   "cell_type": "markdown",
+   "id": "c9a7bf1d-c208-4873-9e06-5db981f8eeaa",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from pyspark_huggingface import HuggingFaceDatasets"
+    "## Create a Spark Session"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "620d3ecb-b9cb-480c-b300-69198cce7a9c",
    "metadata": {},
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "spark = SparkSession.builder.getOrCreate()"
+   ],
    "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f876028-2af5-4e63-8e9d-59afc0959267",
+   "metadata": {},
    "source": [
-    "from pyspark.sql import SparkSession"
+    "## Load a dataset as a Spark DataFrame"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "id": "9255ffcb-0b61-43dc-b57a-2b8af01a8432",
-   "metadata": {},
+   "execution_count": 2,
+   "id": "b8580bde-3f64-4c71-a087-8b3f71099aee",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-26T08:54:32.132099Z",
+     "start_time": "2024-11-26T08:54:28.903653Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "spark = SparkSession.builder.getOrCreate()"
+    "df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")"
    ]
   },
   {
    "cell_type": "code",
-   "id": "7c4501a8-26f4-4f52-9dc8-a70393d567b4",
+   "execution_count": 4,
+   "id": "3bbf61d1-4c2c-40e7-9790-2722637aac9d",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- text: string (nullable = true)\n",
+      " |-- label: long (nullable = true)\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
-    "spark.dataSource.register(HuggingFaceDatasets)"
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "7f7b9a2b-8733-499a-af56-3c51196d060f",
+   "metadata": {},
+   "source": [
+    "# Cache the DataFrame to avoid re-downloading data (caching is lazy; the next action materializes it)\n",
+    "df.cache()"
    ],
    "outputs": [],
    "execution_count": null
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "b8580bde-3f64-4c71-a087-8b3f71099aee",
+   "execution_count": 12,
+   "id": "df121dba-2e1e-4206-b2bf-db156c298ee1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "8530"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")"
+    "# Trigger the cache computation\n",
+    "df.count()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "id": "8866bdfb-0782-4430-8b1e-09c65e699f41",
    "metadata": {
     "editable": true,
@@ -72,72 +116,158 @@
     },
     "tags": []
    },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Row(text='the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', label=1)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "0d9d3112-d19b-4fa8-a6fc-ba40816d1d11",
+   "metadata": {},
+   "source": [
+    "df.show(n=5)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "225bbbef-4164-424d-a701-c6c74494ef81",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4265"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Once loaded, the dataset can be queried like any other Spark DataFrame\n",
+    "df.filter(df.label == 0).count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3932f1fd-a324-4f15-86e1-bbe1064d707a",
+   "metadata": {},
+   "source": [
+    "## Load a different split\n",
+    "The cells above load the dataset without choosing a split. To load a specific one, set the `split` data source option:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "a16e9270-eb02-4568-8739-db4dc715c274",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_df = (\n",
+    "    spark.read.format(\"huggingface\")\n",
+    "    .option(\"split\", \"test\")\n",
+    "    .load(\"rotten_tomatoes\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "3aec5719-c3a1-4d18-92c8-2b0c2f4bb939",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DataFrame[text: string, label: bigint]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_df.cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "d605289d-361d-4a6c-9b70-f7ccdff3aa9d",
+   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "[Stage 5:>                                                          (0 + 1) / 1]"
+      "                                                                                "
      ]
     },
+    {
+     "data": {
+      "text/plain": [
+       "1066"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "df1ad003-1476-4557-811b-31c3888c0030",
+   "metadata": {},
+   "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+--------------------+-----+\n",
       "|                text|label|\n",
       "+--------------------+-----+\n",
-      "|the rock is desti...|    1|\n",
-      "|the gorgeously el...|    1|\n",
-      "|effective but too...|    1|\n",
-      "|if you sometimes ...|    1|\n",
-      "|emerges as someth...|    1|\n",
-      "|the film provides...|    1|\n",
-      "|offers that rare ...|    1|\n",
-      "|perhaps no pictur...|    1|\n",
-      "|steers turns in a...|    1|\n",
-      "|take care of my c...|    1|\n",
-      "|this is a film we...|    1|\n",
-      "|what really surpr...|    1|\n",
-      "|( wendigo is ) wh...|    1|\n",
-      "|one of the greate...|    1|\n",
-      "|ultimately , it p...|    1|\n",
-      "|an utterly compel...|    1|\n",
-      "|illuminating if o...|    1|\n",
-      "|a masterpiece fou...|    1|\n",
-      "|the movie's ripe ...|    1|\n",
-      "|offers a breath o...|    1|\n",
+      "|lovingly photogra...|    1|\n",
+      "|consistently clev...|    1|\n",
+      "|it's like a \" big...|    1|\n",
+      "|the story gives a...|    1|\n",
+      "|red dragon \" neve...|    1|\n",
       "+--------------------+-----+\n",
-      "only showing top 20 rows\n",
+      "only showing top 5 rows\n",
       "\n"
      ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                "
-     ]
     }
    ],
    "source": [
-    "df.show()"
+    "test_df.show(n=5)"
    ]
   },
-  {
-   "cell_type": "code",
-   "id": "873bb4fc-1424-4816-b835-6c2b839d3de4",
-   "metadata": {},
-   "source": [
-    "df.count()"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4a1b895f-fe20-4520-a90d-b17df8e691e4",
+   "id": "a7f14b91-059e-4894-83d2-4ed74e0adaf9",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/pyspark_huggingface/__init__.py b/pyspark_huggingface/__init__.py
@@ -1 +1 @@
-from pyspark_huggingface.huggingface import HuggingFaceDatasets
+from pyspark_huggingface.huggingface import HuggingFaceDatasets as DefaultSource
diff --git a/pyspark_huggingface/huggingface.py b/pyspark_huggingface/huggingface.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-from pyspark_huggingface.huggingface import HuggingFaceDatasets`
	`1`	`+from pyspark_huggingface.huggingface import HuggingFaceDatasets as DefaultSource`