Upgrade to CUDA 12 and TF 2.15 (#1352)

rosbo · web-flow · commit 1d7b809de0f5 · 2024-01-15T20:51:44.000-08:00
Also upgrades `torch` (and related libraries), `lightgbm` & `jax`.

Add tests for numba.

http://b/319722433
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -7,10 +7,12 @@ ARG TORCH_VERSION
 ARG TORCHAUDIO_VERSION
 ARG TORCHTEXT_VERSION
 ARG TORCHVISION_VERSION
+ARG JAX_VERSION
 
 {{ if eq .Accelerator "gpu" }}
 FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
 FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
+FROM gcr.io/kaggle-images/python-jaxlib-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${JAX_VERSION} AS jaxlib_whl
 FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
 {{ else }}
 FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
@@ -36,9 +38,9 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib
 {{ end }}
 
 # Keep these variables in sync if base image is updated.
-ENV TENSORFLOW_VERSION=2.13.0
+ENV TENSORFLOW_VERSION=2.15.0
 # See https://github.com/tensorflow/io#tensorflow-version-compatibility
-ENV TENSORFLOW_IO_VERSION=0.34.0
+ENV TENSORFLOW_IO_VERSION=0.35.0
 
 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
 # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
@@ -47,6 +49,7 @@ ARG TORCH_VERSION
 ARG TORCHAUDIO_VERSION
 ARG TORCHTEXT_VERSION
 ARG TORCHVISION_VERSION
+ARG JAX_VERSION
 
 # Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
 # See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
@@ -158,7 +161,9 @@ RUN pip install lightgbm==$LIGHTGBM_VERSION && \
 
 # Install JAX
 {{ if eq .Accelerator "gpu" }}
-RUN pip install "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html && \
+COPY --from=jaxlib_whl /tmp/whl/*.whl /tmp/jax/
+# b/319722433#comment9: Use pip wheels once their versions match our CUDA version.
+RUN pip install /tmp/jax/*.whl jax==$JAX_VERSION && \
     /tmp/clean-layer.sh
 {{ else }}
 RUN pip install jax[cpu] && \
@@ -169,7 +174,7 @@ RUN pip install jax[cpu] && \
 # Install GPU specific packages
 {{ if eq .Accelerator "gpu" }}
 # Install GPU-only packages
-# No specific package for nnabla-ext-cuda 11.x minor versions.
+# No specific package for nnabla-ext-cuda 12.x minor versions.
 RUN export PATH=/usr/local/cuda/bin:$PATH && \
     export CUDA_ROOT=/usr/local/cuda && \
     pip install pycuda \
@@ -199,10 +204,17 @@ RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html
 
 RUN pip install \
         "tensorflow==${TENSORFLOW_VERSION}" \
-        "tensorflow-io==${TENSORFLOW_IO_VERSION}"\
+        "tensorflow-io==${TENSORFLOW_IO_VERSION}" \
         tensorflow-addons \
         tensorflow_decision_forests \
-        tensorflow_text && \
+        tensorflow_text \
+        tensorflowjs \
+        tensorflow_hub && \
+    /tmp/clean-layer.sh
+
+# TODO(b/318672158): Upgrade to Keras 3 once compatible with other TF libraries.
+# See blockers here: https://b.corp.google.com/issues/319722433#comment8
+RUN pip install keras keras-cv keras-nlp && \
     /tmp/clean-layer.sh
 
 RUN pip install pysal
@@ -268,12 +280,6 @@ RUN pip install scipy \
     apt-get install -y pandoc && \
     pip install essentia
 
-{{ if eq .Accelerator "gpu" }}
-# #1281 Install numba MVC support:
-RUN pip install ptxcompiler-cu11 cubinlinker-cu11 --extra-index-url=https://pypi.nvidia.com
-ENV NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY=1
-{{ end }}
-
 RUN apt-get install -y git-lfs && \
     /tmp/clean-layer.sh
 
@@ -316,8 +322,7 @@ RUN pip install mpld3 \
         s2sphere \
         bayesian-optimization \
         matplotlib-venn \
-        # b/184083722 pyldavis >= 3.3 requires numpy >= 1.20.0 but TensorFlow 2.4.1 / 2.5.0 requires 1.19.2
-        pyldavis==3.2.2 \
+        pyldavis \
         mlxtend \
         altair \
         ImageHash \
@@ -527,8 +532,6 @@ RUN pip install flashtext \
         gym \
         pyarabic \
         pandasql \
-        tensorflow_hub \
-        tensorflowjs \
         jieba  \
         # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668
         https://github.com/hbasria/ggpy/archive/0.11.5.zip \
@@ -543,13 +546,7 @@ RUN pip install flashtext \
         # b/290207097 switch back to the pip catalyst package when bug fixed
         # https://github.com/catalyst-team/catalyst/issues/1440
         git+https://github.com/Philmod/catalyst.git@fix-fp16#egg=catalyst \
-        # b/206990323 osmx 1.1.2 requires numpy >= 1.21 which we don't want.
-        osmnx==1.1.1 \
-        # Remove once `keras-core` is released as Keras
-        keras-core \
-        # TODO(b/315833744) unpin when the alpha versions are merged to the main version.
-        keras-cv \
-        keras-nlp && \
+        osmnx && \
     apt-get -y install libspatialindex-dev
 
 RUN pip install pytorch-ignite \
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -62,6 +62,24 @@ pipeline {
             '''
           }
         }
+        stage('jaxlib') {
+          options {
+            timeout(time: 60, unit: 'MINUTES')
+          }
+          steps {
+            sh '''#!/bin/bash
+              set -exo pipefail
+              source config.txt
+              cd packages/
+              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
+                --package jaxlib \
+                --version $JAX_VERSION \
+                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
+                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
+                --push
+            '''
+          }
+        }
       }
     }
     stage('Build/Test/Diff') {
diff --git a/config.txt b/config.txt
@@ -1,11 +1,12 @@
 BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
-BASE_IMAGE_TAG=m111
-CPU_BASE_IMAGE_NAME=tf2-cpu.2-13.py310
-GPU_BASE_IMAGE_NAME=tf2-gpu.2-13.py310
-LIGHTGBM_VERSION=3.3.2
-TORCH_VERSION=2.0.0
-TORCHAUDIO_VERSION=2.0.1
-TORCHTEXT_VERSION=0.15.1
-TORCHVISION_VERSION=0.15.1
-CUDA_MAJOR_VERSION=11
-CUDA_MINOR_VERSION=8
+BASE_IMAGE_TAG=m114
+CPU_BASE_IMAGE_NAME=tf2-cpu.2-15.py310
+GPU_BASE_IMAGE_NAME=tf2-gpu.2-15.py310
+LIGHTGBM_VERSION=4.2.0
+TORCH_VERSION=2.1.2
+TORCHAUDIO_VERSION=2.1.2
+TORCHTEXT_VERSION=0.16.2
+TORCHVISION_VERSION=0.16.2
+JAX_VERSION=0.4.23
+CUDA_MAJOR_VERSION=12
+CUDA_MINOR_VERSION=1
diff --git a/packages/jaxlib.Dockerfile b/packages/jaxlib.Dockerfile
@@ -0,0 +1,39 @@
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE} AS builder
+
+ARG PACKAGE_VERSION
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+
+# Make sure we are on the right version of CUDA
+RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
+
+# Ensures shared libraries installed with conda can be found by the dynamic link loader.
+# NOTE(review): copied from torch.Dockerfile (where mkl is needed); confirm jaxlib needs this too.
+ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
+
+# Instructions: https://jax.readthedocs.io/en/latest/developer.html#building-jaxlib-from-source
+RUN apt-get update && \
+    apt-get install -y g++ python python3-dev
+
+RUN pip install numpy wheel build
+
+RUN cd /usr/local/src && \
+    git clone https://github.com/google/jax && \
+    cd jax && \
+    git checkout jaxlib-v$PACKAGE_VERSION
+
+RUN cd /usr/local/src/jax && \
+    python build/build.py --enable_cuda
+
+# Using multi-stage builds to ensure the output image is very small
+# See: https://docs.docker.com/develop/develop-images/multistage-build/
+FROM alpine:latest
+
+RUN mkdir -p /tmp/whl/
+COPY --from=builder /usr/local/src/jax/dist/*.whl /tmp/whl
+
+# Print out the built .whl file.
+RUN ls -lh /tmp/whl/
diff --git a/packages/lightgbm.Dockerfile b/packages/lightgbm.Dockerfile
@@ -11,24 +11,20 @@ RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MIN
 
 # Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
 RUN apt-get update && \
-    apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo opencl-headers
+    apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev clinfo nvidia-opencl-dev opencl-headers
 
 RUN cd /usr/local/src && \
     git clone --recursive https://github.com/microsoft/LightGBM && \
     cd LightGBM && \
     git checkout tags/v$PACKAGE_VERSION && \
-    mkdir build && cd build && \
-    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
-    make -j$(nproc) && \
-    cd /usr/local/src/LightGBM/python-package && \
-    python setup.py bdist_wheel
+    ./build-python.sh bdist_wheel  --gpu --opencl-library=/usr/local/cuda/lib64/libOpenCL.so --opencl-include-dir=/usr/local/cuda/include/
 
 # Using multi-stage builds to ensure the output image is very small
 # See: https://docs.docker.com/develop/develop-images/multistage-build/
 FROM alpine:latest
 
 RUN mkdir -p /tmp/whl/
-COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl
+COPY --from=builder /usr/local/src/LightGBM/dist/*.whl /tmp/whl
 
 # Print out the built .whl file.
 RUN ls -lh /tmp/whl/
diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile
@@ -31,7 +31,7 @@ ENV PYTORCH_BUILD_NUMBER=1
 # For PyTorch, we need specifically mkl.
 ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
-ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX"
+ENV TORCH_CUDA_ARCH_LIST="6.0;7.0+PTX;7.5+PTX"
 ENV FORCE_CUDA=1
 RUN cd /usr/local/src && \
     git clone --recursive https://github.com/pytorch/pytorch && \
diff --git a/tests/test_keras.py b/tests/test_keras.py
@@ -5,12 +5,9 @@
 
 os.environ["KERAS_BACKEND"] = "tensorflow" 
 
-# Note that keras_core should only be imported after the backend
-# has been configured. The backend cannot be changed once the
-# package is imported.
-import keras_core as keras
+import keras
 
-class TestKerasCore(unittest.TestCase):
+class TestKeras(unittest.TestCase):
     def test_train(self):
         # Load the data and split it between train and test sets
         (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data(
diff --git a/tests/test_lightgbm.py b/tests/test_lightgbm.py
@@ -30,7 +30,7 @@ def test_cpu(self):
                         lgb_train,
                         num_boost_round=1,
                         valid_sets=lgb_eval,
-                        early_stopping_rounds=1)
+                        callbacks=[lgb.early_stopping(stopping_rounds=1)])
 
         self.assertEqual(1, gbm.best_iteration)
 
@@ -57,7 +57,7 @@ def test_gpu(self):
                         lgb_train,
                         num_boost_round=1,
                         valid_sets=lgb_eval,
-                        early_stopping_rounds=1)
+                        callbacks=[lgb.early_stopping(stopping_rounds=1)])
 
         self.assertEqual(1, gbm.best_iteration)
     
diff --git a/tests/test_numba.py b/tests/test_numba.py
@@ -0,0 +1,35 @@
+import unittest
+
+import numpy as np
+from numba import jit, cuda
+
+from common import gpu_test
+
+class TestNumba(unittest.TestCase):  # Smoke tests for numba CPU (jit) and GPU (cuda.jit) compilation.
+    def test_jit(self):
+        x = np.arange(100).reshape(10, 10)
+
+        @jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
+        def go_fast(a): # Function is compiled to machine code when called the first time
+            trace = 0.0
+            for i in range(a.shape[0]):   # Numba likes loops
+                trace += np.tanh(a[i, i]) # Numba likes NumPy functions
+            return a + trace              # Numba likes NumPy broadcasting
+
+        self.assertEqual(10, go_fast(x).shape[0])
+
+    @gpu_test
+    def test_cuda_jit(self):
+        x = np.arange(10)
+
+        @cuda.jit
+        def increment_by_one(an_array):
+            pos = cuda.grid(1)  # Index of this thread within the 1-D launch grid
+            if pos < an_array.size:  # Threads past the end of the array do nothing
+                an_array[pos] += 1
+
+        threadsperblock = 32
+        blockspergrid = (x.size + (threadsperblock - 1)) // threadsperblock  # Ceil-divide: just enough blocks to cover x
+        self.assertEqual(0, x[0])
+        increment_by_one[blockspergrid, threadsperblock](x)
+        self.assertEqual(1, x[0])