diff --git a/.gitignore b/.gitignore
index 0d038d25..ef82380f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.pyc
 .idea/
 .vscode
-.mypy_cache
\ No newline at end of file
+.mypy_cache
+.generated
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile.tmpl
similarity index 86%
rename from Dockerfile
rename to Dockerfile.tmpl
index 230f5d13..c68e980d 100644
--- a/Dockerfile
+++ b/Dockerfile.tmpl
@@ -1,16 +1,26 @@
-ARG BASE_TAG=m78
-ARG TENSORFLOW_VERSION=2.4.1
-
-FROM gcr.io/deeplearning-platform-release/base-cpu:${BASE_TAG}
-
-# We need to redefine TENSORFLOW_VERSION here to get the default ARG value defined above the FROM instruction.
-# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
-ARG TENSORFLOW_VERSION
+{{ if eq .Accelerator "gpu" }}
+FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m78
+ENV CUDA_MAJOR_VERSION=11
+ENV CUDA_MINOR_VERSION=0
+{{ else }}
+FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m78
+{{ end }}
+# Keep these variables in sync if the base image is updated.
+ENV TENSORFLOW_VERSION=2.6.0
+# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
+# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
+ENV KMP_WARNINGS=0
 
 ADD clean-layer.sh /tmp/clean-layer.sh
 ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
 ADD patches/template_conf.json /opt/kaggle/conf.json
 
+{{ if eq .Accelerator "gpu" }}
+# b/200968891 Keep horovod once torch is upgraded.
+RUN pip uninstall -y horovod && \
+    /tmp/clean-layer.sh
+{{ end }}
+
 # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections,
 # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
 RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \
@@ -24,8 +34,6 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list &
     apt-get install -y openssh-client && \
     /tmp/clean-layer.sh
 
-# Make sure the dynamic linker finds the right libstdc++
-ENV LD_LIBRARY_PATH=/opt/conda/lib
 # b/128333086: Set PROJ_LIB to points to the proj4 cartographic library.
 ENV PROJ_LIB=/opt/conda/share/proj
 
@@ -39,8 +47,71 @@ RUN conda config --add channels nvidia && \
     conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \
     /tmp/clean-layer.sh
 
+{{ if eq .Accelerator "gpu" }}
+RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
+    /tmp/clean-layer.sh
+{{ end }}
+
+# Install PyTorch
+{{ if eq .Accelerator "gpu" }}
+RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
+    /tmp/clean-layer.sh
+{{ else }}
 RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
     /tmp/clean-layer.sh
+{{ end }}
+
+# Install LightGBM
+ENV LIGHTGBM_VERSION=3.2.1
+{{ if eq .Accelerator "gpu" }}
+# Install OpenCL & libboost (required by LightGBM GPU version)
+RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \
+    mkdir -p /etc/OpenCL/vendors && \
+    echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
+    cd /usr/local/src && \
+    git clone --recursive https://github.com/microsoft/LightGBM && \
+    cd LightGBM && \
+    git checkout tags/v$LIGHTGBM_VERSION && \
+    mkdir build && cd build && \
+    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
+    make -j$(nproc) && \
+    cd /usr/local/src/LightGBM/python-package && \
+    python setup.py install --precompile && \
+    /tmp/clean-layer.sh
+{{ else }}
+RUN pip install lightgbm==$LIGHTGBM_VERSION && \
+    /tmp/clean-layer.sh
+{{ end }}
+
+# Install JAX
+ENV JAX_VERSION=0.2.19
+{{ if eq .Accelerator "gpu" }}
+RUN pip install jax[cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION]==$JAX_VERSION -f https://storage.googleapis.com/jax-releases/jax_releases.html && \
+    /tmp/clean-layer.sh
+{{ else }}
+RUN pip install jax[cpu]==$JAX_VERSION && \
+    /tmp/clean-layer.sh
+{{ end }}
+
+# Install mxnet
+{{ if eq .Accelerator "gpu" }}
+RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
+    /tmp/clean-layer.sh
+{{ else }}
+RUN pip install mxnet && \
+    /tmp/clean-layer.sh
+{{ end }}
+
+# Install GPU specific packages
+{{ if eq .Accelerator "gpu" }}
+# Install GPU-only packages
+RUN pip install pycuda && \
+    pip install pynvrtc && \
+    # b/190622765 latest version is causing issue. nnabla fixed it in https://github.com/sony/nnabla/issues/892, waiting for new release before we can remove this pin.
+    pip install pynvml==8.0.4 && \
+    pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
+    /tmp/clean-layer.sh
+{{ end }}
 
 RUN pip install pysal && \
     pip install seaborn python-dateutil dask python-igraph && \
@@ -50,12 +121,8 @@ RUN pip install pysal && \
     # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda.
     apt-get install -y default-jre-headless && \
     pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \
-    /tmp/clean-layer.sh
-
-RUN pip install tensorflow==${TENSORFLOW_VERSION} && \
-    pip install tensorflow-gcs-config==2.4.0 && \
-    pip install tensorflow-addons==0.12.1 && \
-    pip install tensorflow_probability==0.12.2 && \
+    pip install tensorflow-gcs-config==2.6.0 && \
+    pip install tensorflow-addons==0.14.0 && \
     /tmp/clean-layer.sh
 
 RUN apt-get install -y libfreetype6-dev && \
@@ -65,10 +132,7 @@ RUN apt-get install -y libfreetype6-dev && \
     pip install textblob && \
     pip install wordcloud && \
     pip install xgboost && \
-    # Pinned to match GPU version. Update version together.
-    pip install lightgbm==3.2.1 && \
     pip install pydot && \
-    pip install keras-tuner && \
     pip install flake8 && \
     # Pinned because it breaks theano test with the latest version (b/178107003).
     pip install theano-pymc==1.0.11 && \
@@ -99,7 +163,6 @@ RUN apt-get install -y libfreetype6-dev && \
     /tmp/clean-layer.sh
 
 RUN pip install ibis-framework && \
-    pip install mxnet && \
     pip install gluonnlp && \
     pip install gluoncv && \
     /tmp/clean-layer.sh
@@ -384,11 +447,6 @@ RUN pip install flashtext && \
     pip install geopandas && \
     pip install nnabla && \
     pip install vowpalwabbit && \
-    # papermill can replace nbconvert for executing notebooks
-    pip install cloud-tpu-client && \
-    # b/188429515#comment7 tensorflow-cloud >= 0.1.14 installs tensorflow-transform which install apache-beam which downgrades the google.cloud library to 1.x.
-    pip install tensorflow-cloud==0.1.13 && \
-    pip install tensorflow-datasets && \
     pip install pydub && \
     pip install pydegensac && \
     # b/198635596 latest versions of torchmetrics & pytorch-lightning are failing at runtime.
@@ -401,8 +459,6 @@ RUN pip install flashtext && \
     # pycrypto is used by competitions team.
     pip install pycrypto && \
     pip install easyocr && \
-    # Keep JAX version in sync with GPU image.
-    pip install jax[cpu]==0.2.19 && \
     # ipympl adds interactive widget support for matplotlib
     pip install ipympl==0.7.0 && \
     pip install pandarallel && \
diff --git a/Jenkinsfile b/Jenkinsfile
index 878a169e..41aa6a90 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -20,46 +20,7 @@ pipeline {
   }
 
   stages {
-    stage('Docker CPU Build') {
-      options {
-        timeout(time: 120, unit: 'MINUTES')
-      }
-      steps {
-        sh '''#!/bin/bash
-          set -exo pipefail
-
-          ./build | ts
-          ./push ${PRETEST_TAG}
-        '''
-      }
-    }
-
-    stage('Test CPU Image') {
-      options {
-        timeout(time: 5, unit: 'MINUTES')
-      }
-      steps {
-        sh '''#!/bin/bash
-          set -exo pipefail
-
-          date
-          docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
-          ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG}
-        '''
-      }
-    }
-
-    stage('Docker GPU Build') {
-      // A GPU is not required to build this image. However, in our current setup,
-      // the default runtime is set to nvidia (as opposed to runc) and there
-      // is no option to specify a runtime for the `docker build` command.
-      //
-      // TODO(rosbo) don't set `nvidia` as the default runtime and use the
-      // `--runtime=nvidia` flag for the `docker run` command when GPU support is needed.
-      agent { label 'ephemeral-linux-gpu' }
-      options {
-        timeout(time: 60, unit: 'MINUTES')
-      }
+    stage('Clean Images') {
       steps {
         sh '''#!/bin/bash
           set -exo pipefail
@@ -70,51 +31,93 @@ pipeline {
           # will untag the previously built image which is safe to do. Builds for a single branch are performed
           # serially.
           docker image prune -f
-          ./build --gpu --base-image-tag ${PRETEST_TAG} | ts
-          ./push --gpu ${PRETEST_TAG}
         '''
       }
     }
+    stage('Build/Test/Diff') {
+      parallel {
+        stage('CPU') {
+          stages {
+            stage('Build CPU Image') {
+              options {
+                timeout(time: 120, unit: 'MINUTES')
+              }
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
-    stage('Test GPU Image') {
-      agent { label 'ephemeral-linux-gpu' }
-      options {
-        timeout(time: 20, unit: 'MINUTES')
-      }
-      steps {
-        sh '''#!/bin/bash
-          set -exo pipefail
-
-          date
-          docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
-          ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
-        '''
-      }
-    }
+                  ./build | ts
+                  ./push ${PRETEST_TAG}
+                '''
+              }
+            }
+            stage('Test CPU Image') {
+              options {
+                timeout(time: 5, unit: 'MINUTES')
+              }
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
-    stage('Package Versions') {
-      parallel {
-        stage('CPU Diff') {
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
+                  date
+                  docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
+                  ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG}
+                '''
+              }
+            }
+            stage('Diff CPU image') {
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
-              docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
-              ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG}
-            '''
+                  docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
+                  ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG}
+                '''
+              }
+            }
           }
         }
-        stage('GPU Diff') {
+        stage('GPU') {
           agent { label 'ephemeral-linux-gpu' }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
+          stages {
+            stage('Build GPU Image') {
+              options {
+                timeout(time: 120, unit: 'MINUTES')
+              }
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
+                  ./build --gpu | ts
+                  ./push --gpu ${PRETEST_TAG}
+                '''
+              }
+            }
+            stage('Test GPU Image') {
+              options {
+                timeout(time: 20, unit: 'MINUTES')
+              }
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
+
+                  date
+                  docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                  ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                '''
+              }
+            }
+            stage('Diff GPU Image') {
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
-              docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
-              ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
-            '''
+                  docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                  ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                '''
+              }
+            }
           }
-        }
+        }
       }
     }
diff --git a/build b/build
index ae9a9779..4fc9c8e5 100755
--- a/build
+++ b/build
@@ -9,12 +9,12 @@ Build a new Python Docker image.
 
 Options:
     -g, --gpu                 Build an image with GPU support.
     -c, --use-cache           Use layer cache when building a new image.
-    -b, --base-image-tag TAG  Base image tag. Defaults to value defined in DOCKERFILE.
 EOF
 }
 
 CACHE_FLAG='--no-cache'
 DOCKERFILE='Dockerfile'
+ACCELERATOR='none'
 IMAGE_TAG='kaggle/python-build'
 BUILD_ARGS=''
@@ -27,19 +27,11 @@ while :; do
         -g|--gpu)
             IMAGE_TAG='kaggle/python-gpu-build'
             DOCKERFILE='gpu.Dockerfile'
+            ACCELERATOR='gpu'
             ;;
         -c|--use-cache)
             CACHE_FLAG=''
             ;;
-        -b|--base-image-tag)
-            if [[ -z $2 ]]; then
-                usage
-                printf 'ERROR: No TAG specified after the %s flag.\n' "$1" >&2
-                exit
-            fi
-            BUILD_ARGS="--build-arg BASE_TAG=$2"
-            shift # skip the flag value
-            ;;
         -?*)
             usage
             printf 'ERROR: Unknown option: %s\n' "$1" >&2
@@ -57,8 +49,22 @@ BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
 
 readonly CACHE_FLAG
 readonly DOCKERFILE
+readonly ACCELERATOR
 readonly IMAGE_TAG
 readonly BUILD_ARGS
+
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+DOCKERFILE_OUTDIR="${SRCDIR}/.generated"
+mkdir -p $DOCKERFILE_OUTDIR
+DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE"
+
+# Generate Dockerfile from template.
+echo "Generating Dockerfile from template..."
+docker run --rm -v $PWD:/input:ro gcr.io/kaggle-images/go-renderizer:latest --ACCELERATOR=$ACCELERATOR /input/Dockerfile.tmpl > $DOCKERFILE_PATH
+echo "==================== $DOCKERFILE START ===================="
+cat $DOCKERFILE_PATH
+echo "==================== $DOCKERFILE END ===================="
+
 set -x
-docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE" $BUILD_ARGS .
+docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" $BUILD_ARGS .
diff --git a/gpu.Dockerfile b/gpu.Dockerfile
deleted file mode 100644
index 191d54a1..00000000
--- a/gpu.Dockerfile
+++ /dev/null
@@ -1,104 +0,0 @@
-ARG BASE_TAG=staging
-
-FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 AS nvidia
-FROM gcr.io/kaggle-images/python:${BASE_TAG}
-
-ADD clean-layer.sh /tmp/clean-layer.sh
-
-# Cuda support
-COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/
-COPY --from=nvidia /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/
-COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg
-
-ENV CUDA_MAJOR_VERSION=11
-ENV CUDA_MINOR_VERSION=0
-ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
-LABEL com.nvidia.volumes.needed="nvidia_driver"
-LABEL com.nvidia.cuda.version="${CUDA_VERSION}"
-ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/opt/bin:${PATH}
-# The stub is useful to us both for built-time linking and run-time linking, on CPU-only systems.
-# When intended to be used with actual GPUs, make sure to (besides providing access to the host
-# CUDA user libraries, either manually or through the use of nvidia-docker) exclude them. One
-# convenient way to do so is to obscure its contents by a bind mount:
-# docker run .... -v /non-existing-directory:/usr/local/cuda/lib64/stubs:ro ...
-# b/197989446#comment7 libgnutls version at /opt/conda/lib causes apt to fail to fetch packages using https URLs.
-ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    cuda-cupti-$CUDA_VERSION \
-    cuda-cudart-$CUDA_VERSION \
-    cuda-cudart-dev-$CUDA_VERSION \
-    cuda-libraries-$CUDA_VERSION \
-    cuda-libraries-dev-$CUDA_VERSION \
-    cuda-nvml-dev-$CUDA_VERSION \
-    cuda-minimal-build-$CUDA_VERSION \
-    cuda-command-line-tools-$CUDA_VERSION \
-    libcudnn8=8.0.4.30-1+cuda$CUDA_VERSION \
-    libcudnn8-dev=8.0.4.30-1+cuda$CUDA_VERSION \
-    libnccl2=2.7.8-1+cuda$CUDA_VERSION \
-    libnccl-dev=2.7.8-1+cuda$CUDA_VERSION && \
-    ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda && \
-    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    /tmp/clean-layer.sh
-
-ENV LD_LIBRARY_PATH_NO_STUBS="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/opt/conda/lib"
-ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/conda/lib"
-ENV NVIDIA_VISIBLE_DEVICES=all
-ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-ENV NVIDIA_REQUIRE_CUDA="cuda>=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION"
-
-# Install OpenCL & libboost (required by LightGBM GPU version)
-RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \
-    mkdir -p /etc/OpenCL/vendors && \
-    echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
-    /tmp/clean-layer.sh
-
-# When using pip in a conda environment, conda commands should be ran first and then
-# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
-# However, because this image is based on the CPU image, this isn't possible but better
-# to put them at the top of this file to minize conflicts.
-RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_VERSION && \
-    /tmp/clean-layer.sh
-
-# Install Pytorch and torchvision with GPU support.
-# Note: torchtext and torchaudio do not require a separate GPU package.
-RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \
-    /tmp/clean-layer.sh
-
-# Install LightGBM with GPU
-RUN pip uninstall -y lightgbm && \
-    cd /usr/local/src && \
-    git clone --recursive https://github.com/microsoft/LightGBM && \
-    cd LightGBM && \
-    git checkout tags/v3.2.1 && \
-    mkdir build && cd build && \
-    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
-    make -j$(nproc) && \
-    cd /usr/local/src/LightGBM/python-package && \
-    python setup.py install --precompile && \
-    mkdir -p /etc/OpenCL/vendors && \
-    echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
-    /tmp/clean-layer.sh
-
-# Install JAX (Keep JAX version in sync with CPU image)
-RUN pip install jax[cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION]==0.2.19 -f https://storage.googleapis.com/jax-releases/jax_releases.html && \
-    /tmp/clean-layer.sh
-
-# Reinstall packages with a separate version for GPU support.
-RUN pip uninstall -y mxnet && \
-    pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
-    /tmp/clean-layer.sh
-
-# Install GPU-only packages
-RUN pip install pycuda && \
-    pip install pynvrtc && \
-    # b/190622765 latest version is causing issue. nnabla fixed it in https://github.com/sony/nnabla/issues/892, waiting for new release before we can remove this pin.
-    pip install pynvml==8.0.4 && \
-    pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
-    /tmp/clean-layer.sh
-
-# Re-add TensorBoard Jupyter extension patch
-# b/139212522 re-enable TensorBoard once solution for slowdown is implemented.
-# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py
-
-# Remove the CUDA stubs.
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS"
diff --git a/renderizer/Dockerfile b/renderizer/Dockerfile
new file mode 100644
index 00000000..9faac229
--- /dev/null
+++ b/renderizer/Dockerfile
@@ -0,0 +1,12 @@
+# Image used to generate the Dockerfiles from a Go text template.
+#
+# Build:
+#   docker build --rm --pull -t gcr.io/kaggle-images/go-renderizer -f Dockerfile .
+#
+# Push:
+#   docker push gcr.io/kaggle-images/go-renderizer
+FROM golang:1.17
+
+RUN go install github.com/gomatic/renderizer/v2/cmd/renderizer@v2.0.13
+
+ENTRYPOINT ["renderizer"]
\ No newline at end of file
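
Reviewer note on the new build flow (a minimal sketch, not part of the change itself): ./build now renders Dockerfile.tmpl into .generated/ with the go-renderizer image and then runs docker build against the generated file, replacing the deleted gpu.Dockerfile with one templated Dockerfile. The walkthrough below assumes the renderizer image has been built and pushed with the commands from the renderizer/Dockerfile header; it only uses flags and paths that appear in the diff.

    # One-time: build and push the template renderer.
    cd renderizer
    docker build --rm --pull -t gcr.io/kaggle-images/go-renderizer -f Dockerfile .
    docker push gcr.io/kaggle-images/go-renderizer
    cd ..

    # Render the template by hand (mirrors what ./build does internally;
    # renderizer exposes --ACCELERATOR to the template as .Accelerator,
    # "none" for the CPU branch or "gpu" for the GPU branch).
    mkdir -p .generated
    docker run --rm -v $PWD:/input:ro gcr.io/kaggle-images/go-renderizer:latest \
        --ACCELERATOR=gpu /input/Dockerfile.tmpl > .generated/Dockerfile

    # Or let the build script render and build in one step.
    ./build          # CPU image
    ./build --gpu    # GPU image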