# Review notes for the Dockerfile.tmpl and Jenkinsfile parts of this patch (commentary only, not part of any hunk).
#
# Dockerfile.tmpl
#   * The base image references are no longer hard-coded: they are assembled from the BASE_IMAGE_REPO,
#     BASE_IMAGE_TAG, CPU_BASE_IMAGE_NAME and GPU_BASE_IMAGE_NAME build args declared before the first FROM.
#   * GPU variant: two extra stages (`lightgbm_whl`, `torch_whl`) reference pre-built wheel images from
#     gcr.io/kaggle-images; the wheels are copied from /tmp/whl/ of those stages and installed with pip.
#     The in-image LightGBM source build (git clone / cmake / make / `setup.py install --precompile`) is removed.
#   * CPU variant: torch, torchvision, torchaudio and torchtext are installed from the PyTorch wheel index
#     using the TORCH*_VERSION build args instead of hard-coded versions.
#   * The version ARGs are declared a second time after FROM so that they are visible inside the stage.
#   * /usr/local/cuda/compat is appended to LD_LIBRARY_PATH; the ENV is added just before the GPU-only
#     horovod block, i.e. not inside that conditional.
#   * The fastai version pin (2.2.7) and the comment explaining it are removed.
#   * NOTE(review): the wheels arrive through COPY and are deleted by `rm -rf` in the following RUN. Files
#     removed in a later layer still occupy space in the layer that added them - confirm that the resulting
#     image size is acceptable.
#
# Jenkinsfile
#   * Adds a 'Pre-build Packages from Source' stage that runs two parallel stages, 'torch' (180 minute
#     timeout) and 'lightgbm' (10 minute timeout). Each one sources config.txt and calls
#     packages/build_package with --push for the GPU base image.
#   * NOTE(review): the last hunk replaces the `stages {` line with a line holding the same tokens, so it
#     looks like a whitespace-only change - confirm that it is intended.
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 40427969..92eeca84 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,12 +1,33 @@ +ARG BASE_IMAGE_REPO +ARG BASE_IMAGE_TAG +ARG CPU_BASE_IMAGE_NAME +ARG GPU_BASE_IMAGE_NAME +ARG LIGHTGBM_VERSION +ARG TORCH_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHTEXT_VERSION +ARG TORCHVISION_VERSION + {{ if eq .Accelerator "gpu" }} -FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80 +FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl +FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl +FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} ENV CUDA_MAJOR_VERSION=11 ENV CUDA_MINOR_VERSION=0 {{ else }} -FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m80 +FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} {{ end }} # Keep these variables in sync if base image is updated. ENV TENSORFLOW_VERSION=2.6.0 + +# We need to redefine the ARG here to get the ARG value defined above the FROM instruction. +# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact +ARG LIGHTGBM_VERSION +ARG TORCH_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHTEXT_VERSION +ARG TORCHVISION_VERSION + # Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 # See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information ENV KMP_WARNINGS=0 @@ -15,6 +36,9 @@ ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json +# Adds the libcuda.so to LD_LIBRARY_PATH which is necessary for the GPU mxnet package. +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/compat + {{ if eq .Accelerator "gpu" }} # b/200968891 Keeps horovod once torch is upgraded. 
RUN pip uninstall -y horovod && \ @@ -52,29 +76,24 @@ RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MI # Install PyTorch {{ if eq .Accelerator "gpu" }} -RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \ +COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ +RUN pip install /tmp/torch/*.whl && \ + rm -rf /tmp/torch && \ /tmp/clean-layer.sh {{ else }} -RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \ +RUN pip install torch==$TORCH_VERSION+cpu torchvision==$TORCHVISION_VERSION+cpu torchaudio==$TORCHAUDIO_VERSION torchtext==$TORCHTEXT_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \ /tmp/clean-layer.sh {{ end }} # Install LightGBM -ENV LIGHTGBM_VERSION=3.2.1 {{ if eq .Accelerator "gpu" }} +COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/ # Install OpenCL (required by LightGBM GPU version) RUN apt-get install -y ocl-icd-libopencl1 clinfo && \ mkdir -p /etc/OpenCL/vendors && \ echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - cd /usr/local/src && \ - git clone --recursive https://github.com/microsoft/LightGBM && \ - cd LightGBM && \ - git checkout tags/v$LIGHTGBM_VERSION && \ - mkdir build && cd build && \ - cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. 
&& \ - make -j$(nproc) && \ - cd /usr/local/src/LightGBM/python-package && \ - python setup.py install --precompile && \ + pip install /tmp/lightgbm/*.whl && \ + rm -rf /tmp/lightgbm && \ /tmp/clean-layer.sh {{ else }} RUN pip install lightgbm==$LIGHTGBM_VERSION && \ @@ -386,8 +405,7 @@ RUN pip install bleach && \ pip install widgetsnbextension && \ pip install pyarrow && \ pip install feather-format && \ - # fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788 - pip install fastai==2.2.7 && \ + pip install fastai && \ pip install allennlp && \ # https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5 pip install importlib-metadata==3.4.0 && \ diff --git a/Jenkinsfile b/Jenkinsfile index 41aa6a90..99ad30c2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,6 +34,42 @@ pipeline { ''' } } + stage('Pre-build Packages from Source') { + parallel { + stage('torch') { + options { + timeout(time: 180, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + source config.txt + cd packages/ + ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ + --package torch \ + --version $TORCH_VERSION \ + --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \ + --build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \ + --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \ + --push + ''' + } + } + stage('lightgbm') { + options { + timeout(time: 10, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + source config.txt + cd packages/ + ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG --package lightgbm --version $LIGHTGBM_VERSION --push + ''' + } + } + } + } stage('Build/Test/Diff') { parallel { stage('CPU') { @@ -79,7 +115,7 @@ pipeline { } stage('GPU') { agent { label 'ephemeral-linux-gpu' } - stages { + stages { stage('Build GPU Image') { options { timeout(time: 120, unit: 'MINUTES') 
diff --git a/build b/build
index 4fc9c8e5..9b20f2dc 100755
--- a/build
+++ b/build
@@ -47,14 +47,19 @@ done
 BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
 BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
 
+# Read build args from config.txt file.
+# Every line must be a KEY=VALUE pair without whitespace because the file content is word-split.
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+for l in $(cat "${SRCDIR}/config.txt"); do
+    BUILD_ARGS+=" --build-arg $l"
+done
+
 readonly CACHE_FLAG
 readonly DOCKERFILE
 readonly ACCELERATOR
 readonly IMAGE_TAG
 readonly BUILD_ARGS
 
-
-SRCDIR=$(dirname "${BASH_SOURCE[0]}")
 DOCKERFILE_OUTDIR="${SRCDIR}/.generated"
 mkdir -p $DOCKERFILE_OUTDIR
 DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE"
diff --git a/config.txt b/config.txt
new file mode 100644
index 00000000..5ebf4822
--- /dev/null
+++ b/config.txt
@@ -0,0 +1,9 @@
+BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
+BASE_IMAGE_TAG=m80
+CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
+GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
+LIGHTGBM_VERSION=3.2.1
+TORCH_VERSION=1.9.1
+TORCHAUDIO_VERSION=0.9.1
+TORCHTEXT_VERSION=0.10.1
+TORCHVISION_VERSION=0.10.1
\ No newline at end of file
diff --git a/packages/README.md b/packages/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/packages/build_package b/packages/build_package
new file mode 100755
index 00000000..6a6e7e5c
--- /dev/null
+++ b/packages/build_package
@@ -0,0 +1,155 @@
+#!/bin/bash
+# Builds the Docker image holding the .whl file(s) of a package and optionally pushes it to GCR.
+set -e
+
+usage() {
+cat << EOF
+Usage: $0 [OPTIONS]
+Build a new package ".whl".
+
+Options:
+    -p, --package PACKAGE   Package to build (e.g. lightgbm).
+    -v, --version VERSION   Package version to build.
+    -b, --base-image IMAGE  Base image (e.g. gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80).
+    -c, --use-cache         Use layer cache when building a new image.
+    -f, --force-rebuild     Rebuild the image regardless of whether it already exist on GCR.
+    -u, --push              Push image to GCR.
+    --build-arg ARG=VALUE   Build arguments to pass to the docker build command.
+EOF
+}
+
+PACKAGE=''
+PACKAGE_VERSION=''
+BASE_IMAGE=''
+DOCKERFILE=''
+CACHE_FLAG='--no-cache'
+FORCE_REBUILD=false
+PUSH_TO_GCR=false
+BUILD_ARGS=''
+
+while :; do
+    case "$1" in
+        -h|--help)
+            usage
+            exit
+            ;;
+        -p|--package)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No PACKAGE specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            PACKAGE=$2
+            DOCKERFILE="${PACKAGE}.Dockerfile"
+            shift # skip the flag value
+            ;;
+        -v|--version)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No VERSION specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            PACKAGE_VERSION=$2
+            shift # skip the flag value
+            ;;
+        # `-t` is kept as an alias of the documented `-b` flag for backward compatibility.
+        -b|-t|--base-image)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No IMAGE specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            BASE_IMAGE=$2
+            shift # skip the flag value
+            ;;
+        -c|--use-cache)
+            CACHE_FLAG=''
+            ;;
+        -f|--force-rebuild)
+            FORCE_REBUILD=true
+            ;;
+        -u|--push)
+            PUSH_TO_GCR=true
+            ;;
+        --build-arg)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No ARG=VALUE specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            BUILD_ARGS+=" $1 $2"
+            shift # skip the flag value
+            ;;
+        -?*)
+            usage
+            printf 'ERROR: Unknown option: %s\n' "$1" >&2
+            exit 1
+            ;;
+        *)
+            break
+    esac
+
+    shift
+done
+
+readonly PACKAGE
+readonly PACKAGE_VERSION
+readonly BASE_IMAGE
+readonly DOCKERFILE
+readonly CACHE_FLAG
+readonly FORCE_REBUILD
+readonly PUSH_TO_GCR
+readonly BUILD_ARGS
+
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"
+
+if [[ -z "$PACKAGE_VERSION" ]]; then
+    printf 'ERROR: missing --version flag.\n' >&2
+    exit 1
+fi
+
+if [[ -z "$BASE_IMAGE" ]]; then
+    printf 'ERROR: missing --base-image flag.\n' >&2
+    exit 1
+fi
+
+if [[ -z "$DOCKERFILE" ]]; then
+    printf 'ERROR: missing --package flag.\n' >&2
+    exit 1
+fi
+
+# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80`, whatever the repository is.
+TAG=${BASE_IMAGE##*/}
+# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
+TAG=${TAG/:/-}
+# Append the package version
+TAG=$TAG-$PACKAGE_VERSION
+# Add the gcr repo.
+TAG=gcr.io/kaggle-images/python-$PACKAGE-whl:$TAG
+
+SHOULD_BUILD=true
+if ! $FORCE_REBUILD; then
+    echo "Checking if $TAG exists..."
+    docker pull "$TAG" && SHOULD_BUILD=false
+fi
+
+if $SHOULD_BUILD; then
+    echo "Building $TAG..."
+    # $BUILD_ARGS and $CACHE_FLAG are left unquoted on purpose: they must expand to several words or to nothing.
+    docker build --rm --pull $BUILD_ARGS \
+        $CACHE_FLAG \
+        -t "$TAG" \
+        -f "$DOCKERFILE_PATH" \
+        --build-arg "BASE_IMAGE=$BASE_IMAGE" \
+        --build-arg "PACKAGE_VERSION=$PACKAGE_VERSION" \
+        "$SRCDIR"
+
+    if $PUSH_TO_GCR; then
+        echo "Pushing $TAG to GCR..."
+        docker push "$TAG"
+    fi
+else
+    echo "Skipping build. $TAG already exists."
+    echo "Use --force-rebuild if you want to build a new version anyway."
+fi
\ No newline at end of file
diff --git a/packages/lightgbm.Dockerfile b/packages/lightgbm.Dockerfile
new file mode 100644
index 00000000..408c6acc
--- /dev/null
+++ b/packages/lightgbm.Dockerfile
@@ -0,0 +1,34 @@
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE} AS builder
+
+ARG PACKAGE_VERSION
+
+# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
+RUN apt-get update && \
+    apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo
+
+# `git checkout` leaves the submodules at the revisions fetched by `git clone`, so they are
+# re-synced to the revisions recorded by the release tag before building.
+# NOTE(review): confirm that `bdist_wheel` packages the GPU library built by cmake/make below;
+# the in-image build this replaces ran `python setup.py install --precompile`.
+RUN cd /usr/local/src && \
+    git clone --recursive https://github.com/microsoft/LightGBM && \
+    cd LightGBM && \
+    git checkout tags/v$PACKAGE_VERSION && \
+    git submodule update --init --recursive && \
+    mkdir build && cd build && \
+    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
+    make -j$(nproc) && \
+    cd /usr/local/src/LightGBM/python-package && \
+    python setup.py bdist_wheel
+
+# Using multi-stage builds to ensure the output image is very small
+# See: https://docs.docker.com/develop/develop-images/multistage-build/
+FROM alpine:3.14
+
+RUN mkdir -p /tmp/whl/
+COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl/
+
+# Print out the built .whl file.
+RUN ls -lh /tmp/whl/
\ No newline at end of file
diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile
new file mode 100644
index 00000000..de451730
--- /dev/null
+++ b/packages/torch.Dockerfile
@@ -0,0 +1,83 @@
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE} AS builder
+
+ARG PACKAGE_VERSION
+ARG TORCHAUDIO_VERSION
+ARG TORCHTEXT_VERSION
+ARG TORCHVISION_VERSION
+
+# Every version is mandatory: each one selects the git tag checked out below.
+RUN test -n "$PACKAGE_VERSION" && \
+    test -n "$TORCHAUDIO_VERSION" && \
+    test -n "$TORCHTEXT_VERSION" && \
+    test -n "$TORCHVISION_VERSION"
+
+# Build instructions: https://github.com/pytorch/pytorch#from-source
+RUN conda install -y astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses
+
+# By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash.
+# This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000
+ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION
+ENV PYTORCH_BUILD_NUMBER=1
+
+ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX"
+ENV FORCE_CUDA=1
+RUN cd /usr/local/src && \
+    git clone --recursive https://github.com/pytorch/pytorch && \
+    cd pytorch && \
+    git checkout tags/v$PACKAGE_VERSION && \
+    git submodule sync && \
+    git submodule update --init --recursive --jobs 0 && \
+    python setup.py bdist_wheel
+
+# Install torch which is required before we can build other torch* packages.
+RUN pip install /usr/local/src/pytorch/dist/*.whl
+
+# Build torchaudio
+# Instructions: https://github.com/pytorch/audio#from-source
+# See comment above for PYTORCH_BUILD_VERSION.
+ENV BUILD_VERSION=$TORCHAUDIO_VERSION
+RUN cd /usr/local/src && \
+    git clone https://github.com/pytorch/audio && \
+    cd audio && \
+    git checkout tags/v$TORCHAUDIO_VERSION && \
+    git submodule sync && \
+    git submodule update --init --recursive --jobs 0 && \
+    python setup.py bdist_wheel
+
+# Build torchtext
+# Instructions: https://github.com/pytorch/text#building-from-source
+# See comment above for PYTORCH_BUILD_VERSION.
+ENV BUILD_VERSION=$TORCHTEXT_VERSION
+RUN cd /usr/local/src && \
+    git clone https://github.com/pytorch/text && \
+    cd text && \
+    git checkout tags/v$TORCHTEXT_VERSION && \
+    git submodule sync && \
+    git submodule update --init --recursive --jobs 0 && \
+    python setup.py bdist_wheel
+
+# Build torchvision.
+# Instructions: https://github.com/pytorch/vision/tree/main#installation
+# See comment above for PYTORCH_BUILD_VERSION.
+ENV BUILD_VERSION=$TORCHVISION_VERSION
+RUN cd /usr/local/src && \
+    git clone --recursive https://github.com/pytorch/vision && \
+    cd vision && \
+    git checkout tags/v$TORCHVISION_VERSION && \
+    python setup.py bdist_wheel
+
+# Using multi-stage builds to ensure the output image is very small
+# See: https://docs.docker.com/develop/develop-images/multistage-build/
+FROM alpine:3.14
+
+RUN mkdir -p /tmp/whl/
+# The trailing slash marks /tmp/whl/ as a directory destination, as expected for wildcard sources.
+COPY --from=builder /usr/local/src/pytorch/dist/*.whl /tmp/whl/
+COPY --from=builder /usr/local/src/audio/dist/*.whl /tmp/whl/
+COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl/
+COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl/
+
+# Print out the built .whl file.
+RUN ls -lh /tmp/whl/
\ No newline at end of file