Skip to content

Commit 1d7b809

Browse files
authored
Upgrade to CUDA 12 and TF 2.15 (#1352)
Also upgrades `torch` (and related libraries), `lightgbm` & `jax`. Adds tests for numba. http://b/319722433
1 parent 2f42cbb commit 1d7b809

File tree

9 files changed

+131
-48
lines changed

9 files changed

+131
-48
lines changed

Dockerfile.tmpl

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@ ARG TORCH_VERSION
77
ARG TORCHAUDIO_VERSION
88
ARG TORCHTEXT_VERSION
99
ARG TORCHVISION_VERSION
10+
ARG JAX_VERSION
1011

1112
{{ if eq .Accelerator "gpu" }}
1213
FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
1314
FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
15+
FROM gcr.io/kaggle-images/python-jaxlib-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${JAX_VERSION} AS jaxlib_whl
1416
FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
1517
{{ else }}
1618
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
@@ -36,9 +38,9 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib
3638
{{ end }}
3739

3840
# Keep these variables in sync if base image is updated.
39-
ENV TENSORFLOW_VERSION=2.13.0
41+
ENV TENSORFLOW_VERSION=2.15.0
4042
# See https://github.com/tensorflow/io#tensorflow-version-compatibility
41-
ENV TENSORFLOW_IO_VERSION=0.34.0
43+
ENV TENSORFLOW_IO_VERSION=0.35.0
4244

4345
# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
4446
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
@@ -47,6 +49,7 @@ ARG TORCH_VERSION
4749
ARG TORCHAUDIO_VERSION
4850
ARG TORCHTEXT_VERSION
4951
ARG TORCHVISION_VERSION
52+
ARG JAX_VERSION
5053

5154
# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
5255
# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
@@ -158,7 +161,9 @@ RUN pip install lightgbm==$LIGHTGBM_VERSION && \
158161

159162
# Install JAX
160163
{{ if eq .Accelerator "gpu" }}
161-
RUN pip install "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html && \
164+
COPY --from=jaxlib_whl /tmp/whl/*.whl /tmp/jax/
165+
# b/319722433#comment9: Use pip wheels once a released version matches our CUDA version.
166+
RUN pip install /tmp/jax/*.whl jax==$JAX_VERSION && \
162167
/tmp/clean-layer.sh
163168
{{ else }}
164169
RUN pip install jax[cpu] && \
@@ -169,7 +174,7 @@ RUN pip install jax[cpu] && \
169174
# Install GPU specific packages
170175
{{ if eq .Accelerator "gpu" }}
171176
# Install GPU-only packages
172-
# No specific package for nnabla-ext-cuda 11.x minor versions.
177+
# No specific package for nnabla-ext-cuda 12.x minor versions.
173178
RUN export PATH=/usr/local/cuda/bin:$PATH && \
174179
export CUDA_ROOT=/usr/local/cuda && \
175180
pip install pycuda \
@@ -199,10 +204,17 @@ RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html
199204

200205
RUN pip install \
201206
"tensorflow==${TENSORFLOW_VERSION}" \
202-
"tensorflow-io==${TENSORFLOW_IO_VERSION}"\
207+
"tensorflow-io==${TENSORFLOW_IO_VERSION}" \
203208
tensorflow-addons \
204209
tensorflow_decision_forests \
205-
tensorflow_text && \
210+
tensorflow_text \
211+
tensorflowjs \
212+
tensorflow_hub && \
213+
/tmp/clean-layer.sh
214+
215+
# TODO(b/318672158): Upgrade to Keras 3 once compatible with other TF libraries.
216+
# See blockers here: https://b.corp.google.com/issues/319722433#comment8
217+
RUN pip install keras keras-cv keras-nlp && \
206218
/tmp/clean-layer.sh
207219

208220
RUN pip install pysal
@@ -268,12 +280,6 @@ RUN pip install scipy \
268280
apt-get install -y pandoc && \
269281
pip install essentia
270282

271-
{{ if eq .Accelerator "gpu" }}
272-
# #1281 Install numba MVC support:
273-
RUN pip install ptxcompiler-cu11 cubinlinker-cu11 --extra-index-url=https://pypi.nvidia.com
274-
ENV NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY=1
275-
{{ end }}
276-
277283
RUN apt-get install -y git-lfs && \
278284
/tmp/clean-layer.sh
279285

@@ -316,8 +322,7 @@ RUN pip install mpld3 \
316322
s2sphere \
317323
bayesian-optimization \
318324
matplotlib-venn \
319-
# b/184083722 pyldavis >= 3.3 requires numpy >= 1.20.0 but TensorFlow 2.4.1 / 2.5.0 requires 1.19.2
320-
pyldavis==3.2.2 \
325+
pyldavis \
321326
mlxtend \
322327
altair \
323328
ImageHash \
@@ -527,8 +532,6 @@ RUN pip install flashtext \
527532
gym \
528533
pyarabic \
529534
pandasql \
530-
tensorflow_hub \
531-
tensorflowjs \
532535
jieba \
533536
# ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668
534537
https://github.com/hbasria/ggpy/archive/0.11.5.zip \
@@ -543,13 +546,7 @@ RUN pip install flashtext \
543546
# b/290207097 switch back to the pip catalyst package when bug fixed
544547
# https://github.com/catalyst-team/catalyst/issues/1440
545548
git+https://github.com/Philmod/catalyst.git@fix-fp16#egg=catalyst \
546-
# b/206990323 osmx 1.1.2 requires numpy >= 1.21 which we don't want.
547-
osmnx==1.1.1 \
548-
# Remove once `keras-core` is released as Keras
549-
keras-core \
550-
# TODO(b/315833744) unpin when the alpha versions are merged to the main version.
551-
keras-cv \
552-
keras-nlp && \
549+
osmnx && \
553550
apt-get -y install libspatialindex-dev
554551

555552
RUN pip install pytorch-ignite \

Jenkinsfile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,24 @@ pipeline {
6262
'''
6363
}
6464
}
65+
stage('jaxlib') {
66+
options {
67+
timeout(time: 60, unit: 'MINUTES')
68+
}
69+
steps {
70+
sh '''#!/bin/bash
71+
set -exo pipefail
72+
source config.txt
73+
cd packages/
74+
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
75+
--package jaxlib \
76+
--version $JAX_VERSION \
77+
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
78+
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
79+
--push
80+
'''
81+
}
82+
}
6583
}
6684
}
6785
stage('Build/Test/Diff') {

config.txt

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
2-
BASE_IMAGE_TAG=m111
3-
CPU_BASE_IMAGE_NAME=tf2-cpu.2-13.py310
4-
GPU_BASE_IMAGE_NAME=tf2-gpu.2-13.py310
5-
LIGHTGBM_VERSION=3.3.2
6-
TORCH_VERSION=2.0.0
7-
TORCHAUDIO_VERSION=2.0.1
8-
TORCHTEXT_VERSION=0.15.1
9-
TORCHVISION_VERSION=0.15.1
10-
CUDA_MAJOR_VERSION=11
11-
CUDA_MINOR_VERSION=8
2+
BASE_IMAGE_TAG=m114
3+
CPU_BASE_IMAGE_NAME=tf2-cpu.2-15.py310
4+
GPU_BASE_IMAGE_NAME=tf2-gpu.2-15.py310
5+
LIGHTGBM_VERSION=4.2.0
6+
TORCH_VERSION=2.1.2
7+
TORCHAUDIO_VERSION=2.1.2
8+
TORCHTEXT_VERSION=0.16.2
9+
TORCHVISION_VERSION=0.16.2
10+
JAX_VERSION=0.4.23
11+
CUDA_MAJOR_VERSION=12
12+
CUDA_MINOR_VERSION=1

packages/jaxlib.Dockerfile

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
ARG BASE_IMAGE
2+
3+
FROM ${BASE_IMAGE} AS builder
4+
5+
ARG PACKAGE_VERSION
6+
ARG CUDA_MAJOR_VERSION
7+
ARG CUDA_MINOR_VERSION
8+
9+
# Make sure we are on the right version of CUDA
10+
RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
11+
12+
# Ensures shared libraries installed with conda can be found by the dynamic link loader.
13+
# For PyTorch, we need specifically mkl.
14+
ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
15+
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
16+
17+
# Instructions: https://jax.readthedocs.io/en/latest/developer.html#building-jaxlib-from-source
18+
RUN apt-get update && \
19+
apt-get install -y g++ python python3-dev
20+
21+
RUN pip install numpy wheel build
22+
23+
RUN cd /usr/local/src && \
24+
git clone https://github.com/google/jax && \
25+
cd jax && \
26+
git checkout jaxlib-v$PACKAGE_VERSION
27+
28+
RUN cd /usr/local/src/jax && \
29+
python build/build.py --enable_cuda
30+
31+
# Using multi-stage builds to ensure the output image is very small
32+
# See: https://docs.docker.com/develop/develop-images/multistage-build/
33+
FROM alpine:latest
34+
35+
RUN mkdir -p /tmp/whl/
36+
# The wildcard may match more than one wheel; Docker then requires the
# destination to be a directory path ending in "/".
COPY --from=builder /usr/local/src/jax/dist/*.whl /tmp/whl/
37+
38+
# Print out the built .whl file.
39+
RUN ls -lh /tmp/whl/

packages/lightgbm.Dockerfile

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,20 @@ RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MIN
1111

1212
# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
1313
RUN apt-get update && \
14-
apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo opencl-headers
14+
apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev clinfo nvidia-opencl-dev opencl-headers
1515

1616
RUN cd /usr/local/src && \
1717
git clone --recursive https://github.com/microsoft/LightGBM && \
1818
cd LightGBM && \
1919
git checkout tags/v$PACKAGE_VERSION && \
20-
mkdir build && cd build && \
21-
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
22-
make -j$(nproc) && \
23-
cd /usr/local/src/LightGBM/python-package && \
24-
python setup.py bdist_wheel
20+
./build-python.sh bdist_wheel --gpu --opencl-library=/usr/local/cuda/lib64/libOpenCL.so --opencl-include-dir=/usr/local/cuda/include/
2521

2622
# Using multi-stage builds to ensure the output image is very small
2723
# See: https://docs.docker.com/develop/develop-images/multistage-build/
2824
FROM alpine:latest
2925

3026
RUN mkdir -p /tmp/whl/
31-
COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl
27+
# The wildcard may match more than one wheel; Docker then requires the
# destination to be a directory path ending in "/".
COPY --from=builder /usr/local/src/LightGBM/dist/*.whl /tmp/whl/
3228

3329
# Print out the built .whl file.
3430
RUN ls -lh /tmp/whl/

packages/torch.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ ENV PYTORCH_BUILD_NUMBER=1
3131
# For PyTorch, we need specifically mkl.
3232
ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
3333
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
34-
ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX"
34+
ENV TORCH_CUDA_ARCH_LIST="6.0;7.0+PTX;7.5+PTX"
3535
ENV FORCE_CUDA=1
3636
RUN cd /usr/local/src && \
3737
git clone --recursive https://github.com/pytorch/pytorch && \

tests/test_keras_core.py renamed to tests/test_keras.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,9 @@
55

66
os.environ["KERAS_BACKEND"] = "tensorflow"
77

8-
# Note that keras_core should only be imported after the backend
9-
# has been configured. The backend cannot be changed once the
10-
# package is imported.
11-
import keras_core as keras
8+
import keras
129

13-
class TestKerasCore(unittest.TestCase):
10+
class TestKeras(unittest.TestCase):
1411
def test_train(self):
1512
# Load the data and split it between train and test sets
1613
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data(

tests/test_lightgbm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def test_cpu(self):
3030
lgb_train,
3131
num_boost_round=1,
3232
valid_sets=lgb_eval,
33-
early_stopping_rounds=1)
33+
callbacks=[lgb.early_stopping(stopping_rounds=1)])
3434

3535
self.assertEqual(1, gbm.best_iteration)
3636

@@ -57,7 +57,7 @@ def test_gpu(self):
5757
lgb_train,
5858
num_boost_round=1,
5959
valid_sets=lgb_eval,
60-
early_stopping_rounds=1)
60+
callbacks=[lgb.early_stopping(stopping_rounds=1)])
6161

6262
self.assertEqual(1, gbm.best_iteration)
6363

tests/test_numba.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import unittest
2+
3+
import numpy as np
4+
from numba import jit, cuda
5+
6+
from common import gpu_test
7+
8+
class TestNumba(unittest.TestCase):
9+
def test_jit(self):
10+
x = np.arange(100).reshape(10, 10)
11+
12+
@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
13+
def go_fast(a): # Function is compiled to machine code when called the first time
14+
trace = 0.0
15+
for i in range(a.shape[0]): # Numba likes loops
16+
trace += np.tanh(a[i, i]) # Numba likes NumPy functions
17+
return a + trace # Numba likes NumPy broadcasting
18+
19+
self.assertEqual(10, go_fast(x).shape[0])
20+
21+
@gpu_test
22+
def test_cuda_jit(self):
23+
x = np.arange(10)
24+
25+
@cuda.jit
26+
def increment_by_one(an_array):
27+
pos = cuda.grid(1)
28+
if pos < an_array.size:
29+
an_array[pos] += 1
30+
31+
threadsperblock = 32
32+
# Ceiling division: the fewest blocks whose threads cover every element of x.
blockspergrid = (x.size + (threadsperblock - 1)) // threadsperblock
33+
self.assertEqual(0, x[0])
34+
increment_by_one[blockspergrid, threadsperblock](x)
35+
self.assertEqual(1, x[0])

0 commit comments

Comments
 (0)