Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions test-horovod-tensorflow/test-horovod-tensorflow.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
},
"outputs": [],
"source": [
"# per ML-3824 need to install tensorflow and mlrun in the same command\n",
"!pip install plotly tensorflow==2.15.1 mlrun # TODO: remove 2.15.1 here and in functions requirements after ML-10271 fix"
"# with tensorflow==2.19.1 horovod build fails\n",
"!pip install plotly tensorflow==2.18.1 mlrun"
]
},
{
Expand Down Expand Up @@ -118,7 +118,7 @@
"def get_model() -> tf.keras.Model:\n",
" # Build the model architecture:\n",
" inputs = tf.keras.Input(shape=(28, 28))\n",
" x = tf.keras.layers.experimental.preprocessing.Rescaling(1.0 / 255)(inputs)\n",
" x = tf.keras.layers.Rescaling(1.0 / 255)(inputs)\n",
" x = tf.keras.layers.Flatten()(x)\n",
" x = tf.keras.layers.Dense(128, activation=\"relu\")(x)\n",
" x = tf.keras.layers.Dense(128, activation=\"relu\")(x)\n",
Expand Down Expand Up @@ -147,7 +147,7 @@
"\n",
" # Compile the model:\n",
" model.compile(\n",
" optimizer=tf.keras.optimizers.legacy.SGD(learning_rate=0.1, momentum=0.9),\n",
" optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9),\n",
" loss=\"sparse_categorical_crossentropy\",\n",
" metrics=[\"accuracy\"]\n",
" )\n",
Expand Down Expand Up @@ -221,7 +221,7 @@
"outputs": [],
"source": [
"# Create the job function:\n",
"job_function = project.set_function(\"./func.py\", name=\"train_job\", kind=\"job\", image=\"mlrun/mlrun\", handler=\"train\", requirements=[\"tensorflow==2.15.1\"])\n",
"job_function = project.set_function(\"./func.py\", name=\"train_job\", kind=\"job\", image=\"mlrun/mlrun\", handler=\"train\", requirements=[\"tensorflow==2.18.1\"])\n",
"job_function.apply(mlrun.auto_mount())\n",
"job_function.deploy()"
]
Expand All @@ -238,11 +238,15 @@
"outputs": [],
"source": [
"# Create the open mpi function:\n",
"mpijob_function = project.set_function(\"./func.py\", name=\"train_mpijob\", kind=\"mpijob\", image=\"mlrun/mlrun\", handler=\"train\", requirements=[\"horovod[tensorflow]\"])\n",
"mpijob_function = project.set_function(\"./func.py\", name=\"train_mpijob\", kind=\"mpijob\", image=\"mlrun/mlrun\", handler=\"train\")\n",
"mpijob_function.apply(mlrun.auto_mount())\n",
"mpijob_function.spec.replicas = N_RANKS\n",
"mpijob_function.with_commands([\"pip install tensorflow==2.15.1\"])\n",
"mpijob_function.deploy(builder_env={\"HOROVOD_WITH_TENSORFLOW\": \"1\"})"
"mpijob_function.with_commands([\n",
" \"apt-get update && apt-get install -y build-essential g++ cmake llvm-dev libclang-dev\",\n",
" \"pip install tensorflow==2.18.1\",\n",
" \"HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_XLA=1 HOROVOD_WITHOUT_MLX=1 pip install horovod[tensorflow]\",\n",
"])\n",
"mpijob_function.deploy()\n"
]
},
{
Expand Down Expand Up @@ -384,6 +388,22 @@
"# assert mpijob_time < job_time\n",
"# assert np.isclose(job_accuracy, mpijob_accuracy, atol=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31ba0442-a98b-46fc-a680-9dfcdf1dfe9a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "81d92305-268f-4ead-87df-5ab3a30f294b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down