From 319fa6bf0ee17edcce595afa270dac8c0cabcc30 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 17 Sep 2021 22:22:54 +0000 Subject: [PATCH 01/12] Templatize build & start from TF base image. - Build CPU & GPU images in parallel (faster build time) - The GPU image is no longer layers on top of the CPU image (small GPU image, simpler because no need to install CUDA ourselves) - Upgrade TensorFlow to 2.6 http://b/167268016 --- .gitignore | 3 +- Dockerfile.tmpl | 559 ++++++++++++++++++++++++++++++++++++++++++ Jenkinsfile | 172 +++++++------ build | 18 +- renderizer/Dockerfile | 12 + 5 files changed, 675 insertions(+), 89 deletions(-) create mode 100644 Dockerfile.tmpl create mode 100644 renderizer/Dockerfile diff --git a/.gitignore b/.gitignore index 0d038d25..ef82380f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc .idea/ .vscode -.mypy_cache \ No newline at end of file +.mypy_cache +.generated \ No newline at end of file diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl new file mode 100644 index 00000000..b1f0bec3 --- /dev/null +++ b/Dockerfile.tmpl @@ -0,0 +1,559 @@ +ARG BASE_TAG=m78 + +{{ if eq .Accelerator "gpu" }} +FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:${BASE_TAG} +{{ else }} +FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:${BASE_TAG} +{{ end }} +# Keep these variables in sync if base image is updated. +ENV TENSORFLOW_VERSION=2.6.0 +ENV CUDA_MAJOR_VERSION=11 +ENV CUDA_MINOR_VERSION=0 + +ADD clean-layer.sh /tmp/clean-layer.sh +ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl +ADD patches/template_conf.json /opt/kaggle/conf.json + +# Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, +# as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 +RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \ + apt-get update && \ + # Needed by vowpalwabbit & lightGBM (GPU build). 
+ # https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Python#installing + # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm + apt-get install -y build-essential unzip cmake && \ + apt-get install -y libboost-dev libboost-program-options-dev libboost-system-dev libboost-thread-dev libboost-math-dev libboost-test-dev libboost-python-dev libboost-filesystem-dev zlib1g-dev && \ + # b/182601974: ssh client was removed from the base image but is required for packages such as stable-baselines. + apt-get install -y openssh-client && \ + /tmp/clean-layer.sh + + + +# TODO(rosbo): Is this needed? Make sure gpu and stubs are properly set. +# Make sure the dynamic linker finds the right libstdc++ +# ENV LD_LIBRARY_PATH=/opt/conda/lib + +# b/128333086: Set PROJ_LIB to point to the proj4 cartographic library. +ENV PROJ_LIB=/opt/conda/share/proj + +# Install conda packages not available on pip. +# When using pip in a conda environment, conda commands should be run first and then +# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ +RUN conda config --add channels nvidia && \ + conda config --add channels rapidsai && \ + # Base image channel order: conda-forge (highest priority), defaults. + # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. 
+ conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \ + /tmp/clean-layer.sh + +{{ if eq .Accelerator "gpu" }} +RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ + /tmp/clean-layer.sh +{{ end }} + +# Install PyTorch +{{ if eq .Accelerator "gpu" }} +RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \ + /tmp/clean-layer.sh +{{ else }} +RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \ + /tmp/clean-layer.sh +{{ end }} + +# Install LightGBM +ENV LIGHTGBM_VERSION=3.2.1 +{{ if eq .Accelerator "gpu" }} +# Install OpenCL & libboost (required by LightGBM GPU version) +RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \ + mkdir -p /etc/OpenCL/vendors && \ + echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \pip uninstall -y lightgbm && \ + cd /usr/local/src && \ + git clone --recursive https://github.com/microsoft/LightGBM && \ + cd LightGBM && \ + git checkout tags/v$LIGHTGBM_VERSION && \ + mkdir build && cd build && \ + cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. 
&& \ + make -j$(nproc) && \ + cd /usr/local/src/LightGBM/python-package && \ + python setup.py install --precompile && \ + mkdir -p /etc/OpenCL/vendors && \ + echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ + /tmp/clean-layer.sh +{{ else }} +RUN pip install lightgbm==$LIGHTGBM_VERSION && \ + /tmp/clean-layer.sh +{{ end }} + +# Install JAX +ENV JAX_VERSION=0.2.19 +{{ if eq .Accelerator "gpu" }} +RUN pip install jax[cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION]==$JAX_VERSION -f https://storage.googleapis.com/jax-releases/jax_releases.html && \ + /tmp/clean-layer.sh +{{ else }} +RUN pip install jax[cpu]==$JAX_VERSION && \ + /tmp/clean-layer.sh +{{ end }} + +# Install mxnet +{{ if eq .Accelerator "gpu" }} +RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ + /tmp/clean-layer.sh +{{ else }} +RUN pip install mxnet && \ + /tmp/clean-layer.sh +{{ end}} + +# Install GPU specific packages +{{ if eq .Accelerator "gpu" }} +# Install GPU-only packages +RUN pip install pycuda && \ + pip install pynvrtc && \ + # b/190622765 latest version is causing issue. nnabla fixed it in https://github.com/sony/nnabla/issues/892, waiting for new release before we can remove this pin. + pip install pynvml==8.0.4 && \ + pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ + /tmp/clean-layer.sh +{{ end }} + +RUN pip install pysal && \ + pip install seaborn python-dateutil dask python-igraph && \ + pip install pyyaml joblib husl geopy ml_metrics mne pyshp && \ + pip install pandas && \ + # Install h2o from source. + # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. 
+ apt-get install -y default-jre-headless && \ + pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \ + pip install tensorflow-gcs-config==2.6.0 && \ + pip install tensorflow-addons==0.14.0 && \ + /tmp/clean-layer.sh + +RUN apt-get install -y libfreetype6-dev && \ + apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ + # b/198300835 gensim 4.1.0 is not compatible with our version of numpy. + pip install gensim==4.0.1 && \ + pip install textblob && \ + pip install wordcloud && \ + pip install xgboost && \ + pip install pydot && \ + pip install flake8 && \ + # Pinned because it breaks theano test with the latest version (b/178107003). + pip install theano-pymc==1.0.11 && \ + pip install python-Levenshtein && \ + pip install hep_ml && \ + # NLTK Project datasets + mkdir -p /usr/share/nltk_data && \ + # NLTK Downloader no longer continues smoothly after an error, so we explicitly list + # the corpuses that work + # "yes | ..." answers yes to the retry prompt in case of an error. See b/133762095. 
+ yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ + basque_grammars biocreative_ppi bllip_wsj_no_aux \ + book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ + comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ + europarl_raw floresta gazetteers genesis gutenberg \ + ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \ + masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ + mte_teip5 names nps_chat omw opinion_lexicon paradigms \ + pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ + pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ + sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ + state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ + twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ + vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ + # Stop-words + pip install stop-words && \ + pip install scikit-image && \ + /tmp/clean-layer.sh + +RUN pip install ibis-framework && \ + pip install gluonnlp && \ + pip install gluoncv && \ + /tmp/clean-layer.sh + +RUN pip install scipy && \ + # b/176817038 avoid upgrade to 0.24 which is causing issues with hep-ml package. 
+ pip install scikit-learn==0.23.2 && \ + # HDF5 support + pip install h5py && \ + pip install biopython && \ + # PUDB, for local debugging convenience + pip install pudb && \ + pip install imbalanced-learn && \ + # Profiling and other utilities + pip install line_profiler && \ + pip install orderedmultidict && \ + pip install smhasher && \ + pip install bokeh && \ + pip install numba && \ + pip install datashader && \ + # Boruta (python implementation) + pip install Boruta && \ + apt-get install -y graphviz && pip install graphviz && \ + # Pandoc is a dependency of deap + apt-get install -y pandoc && \ + pip install git+https://github.com/scikit-learn-contrib/py-earth.git@issue191 && \ + pip install essentia && \ + /tmp/clean-layer.sh + +# vtk with dependencies +RUN apt-get install -y libgl1-mesa-glx && \ + pip install vtk && \ + # xvfbwrapper with dependencies + apt-get install -y xvfb && \ + pip install xvfbwrapper && \ + /tmp/clean-layer.sh + +RUN pip install mpld3 && \ + pip install gpxpy && \ + pip install arrow && \ + pip install nilearn && \ + pip install nibabel && \ + pip install pronouncing && \ + pip install markovify && \ + pip install imgaug && \ + pip install preprocessing && \ + pip install path.py && \ + pip install Geohash && \ + # https://github.com/vinsci/geohash/issues/4 + sed -i -- 's/geohash/.geohash/g' /opt/conda/lib/python3.7/site-packages/Geohash/__init__.py && \ + pip install deap && \ + pip install tpot && \ + pip install scikit-optimize && \ + pip install haversine && \ + pip install toolz cytoolz && \ + pip install plotly && \ + pip install hyperopt && \ + pip install fitter && \ + pip install langid && \ + # Delorean. 
Useful for dealing with datetime + pip install delorean && \ + pip install trueskill && \ + # Useful data exploration libraries (for missing data and generating reports) + pip install missingno && \ + pip install pandas-profiling && \ + pip install s2sphere && \ + pip install bayesian-optimization && \ + pip install matplotlib-venn && \ + # b/184083722 pyldavis >= 3.3 requires numpy >= 1.20.0 but TensorFlow 2.4.1 / 2.5.0 requires 1.19.2 + pip install pyldavis==3.2.2 && \ + pip install mlxtend && \ + pip install altair && \ + # b/183944405 pystan 3.x is not compatible with fbprophet. + pip install pystan==2.19.1.1 && \ + pip install ImageHash && \ + pip install ecos && \ + pip install CVXcanon && \ + pip install pymc3 && \ + pip install imagecodecs && \ + pip install tifffile && \ + pip install spectral && \ + pip install descartes && \ + pip install geojson && \ + pip install pydicom && \ + pip install wavio && \ + pip install SimpleITK && \ + pip install hmmlearn && \ + pip install bayespy && \ + pip install gplearn && \ + pip install PyAstronomy && \ + pip install squarify && \ + pip install fuzzywuzzy && \ + pip install python-louvain && \ + pip install pyexcel-ods && \ + pip install sklearn-pandas && \ + pip install stemming && \ + pip install fbprophet && \ + pip install holoviews && \ + pip install geoviews && \ + pip install hypertools && \ + pip install py_stringsimjoin && \ + pip install mlens && \ + pip install scikit-multilearn && \ + pip install cleverhans && \ + pip install leven && \ + pip install catboost && \ + pip install lightfm && \ + pip install folium && \ + pip install scikit-plot && \ + # dipy requires the optional fury dependency for visualizations. 
+ pip install fury dipy && \ + pip install plotnine && \ + pip install scikit-surprise && \ + pip install pymongo && \ + pip install geoplot && \ + pip install eli5 && \ + pip install implicit && \ + pip install kaggle && \ + /tmp/clean-layer.sh + +RUN pip install tensorpack && \ + # Add google PAIR-code Facets + cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ + export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ + pip install pycountry && \ + pip install iso3166 && \ + pip install pydash && \ + pip install kmodes --no-dependencies && \ + pip install librosa && \ + pip install polyglot && \ + pip install mmh3 && \ + pip install fbpca && \ + pip install sentencepiece && \ + pip install cufflinks && \ + pip install lime && \ + pip install memory_profiler && \ + /tmp/clean-layer.sh + +# install cython & cysignals before pyfasttext +RUN pip install --upgrade cython && \ + pip install --upgrade cysignals && \ + pip install pyfasttext && \ + pip install fasttext && \ + apt-get install -y libhunspell-dev && pip install hunspell && \ + pip install annoy && \ + pip install category_encoders && \ + # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 + pip install google-cloud-automl==1.0.1 && \ + pip install google-cloud-bigquery==2.2.0 && \ + pip install google-cloud-storage && \ + pip install google-cloud-translate==3.* && \ + pip install google-cloud-language==2.* && \ + pip install google-cloud-videointelligence==2.* && \ + pip install google-cloud-vision==2.* && \ + # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. 
+ pip uninstall -y google-cloud-bigquery-storage && \ + # After launch this should be installed from pip + pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release && \ + pip install ortools && \ + pip install scattertext && \ + # Pandas data reader + pip install pandas-datareader && \ + pip install wordsegment && \ + pip install wordbatch && \ + pip install emoji && \ + # Add Japanese morphological analysis engine + pip install janome && \ + pip install wfdb && \ + pip install vecstack && \ + # yellowbrick machine learning visualization library + pip install yellowbrick && \ + pip install mlcrate && \ + /tmp/clean-layer.sh + +RUN pip install bleach && \ + pip install certifi && \ + pip install cycler && \ + pip install decorator && \ + pip install entrypoints && \ + pip install html5lib && \ + pip install ipykernel && \ + pip install ipython && \ + pip install ipython-genutils && \ + pip install ipywidgets && \ + pip install isoweek && \ + pip install jedi && \ + pip install Jinja2 && \ + pip install jsonschema && \ + pip install jupyter-client && \ + pip install jupyter-console && \ + pip install jupyter-core && \ + pip install MarkupSafe && \ + pip install mistune && \ + pip install nbconvert && \ + pip install nbformat && \ + pip install notebook && \ + pip install papermill && \ + pip install olefile && \ + # b/198300835 kornia 0.5.10 is not compatible with our version of numpy. 
+ pip install kornia==0.5.8 && \ + pip install pandas_summary && \ + pip install pandocfilters && \ + pip install pexpect && \ + pip install pickleshare && \ + pip install Pillow && \ + # Install openslide and its python binding + apt-get install -y openslide-tools && \ + pip install openslide-python && \ + pip install ptyprocess && \ + pip install Pygments && \ + pip install pyparsing && \ + pip install pytz && \ + pip install PyYAML && \ + pip install pyzmq && \ + pip install qtconsole && \ + pip install six && \ + pip install terminado && \ + pip install tornado && \ + pip install tqdm && \ + pip install traitlets && \ + pip install wcwidth && \ + pip install webencodings && \ + pip install widgetsnbextension && \ + pip install pyarrow && \ + pip install feather-format && \ + # fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788 + pip install fastai==2.2.7 && \ + pip install allennlp && \ + # https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5 + pip install importlib-metadata==3.4.0 && \ + python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ + apt-get install -y ffmpeg && \ + /tmp/clean-layer.sh + + ########### + # + # NEW CONTRIBUTORS: + # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end + # of all non-final lines. Thanks! 
+ # + ########### + +RUN pip install flashtext && \ + pip install wandb && \ + pip install marisa-trie && \ + pip install pyemd && \ + pip install pyupset && \ + pip install pympler && \ + pip install s3fs && \ + pip install featuretools && \ + pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper && \ + pip install hpsklearn && \ + pip install git+https://github.com/Kaggle/learntools && \ + pip install kmapper && \ + pip install shap && \ + pip install ray && \ + pip install gym && \ + pip install pyarabic && \ + pip install pandasql && \ + pip install tensorflow_hub && \ + pip install jieba && \ + pip install git+https://github.com/SauceCat/PDPbox && \ + # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668 + pip install https://github.com/hbasria/ggpy/archive/0.11.5.zip && \ + pip install cesium && \ + pip install rgf_python && \ + pip install tsfresh && \ + pip install pykalman && \ + pip install optuna && \ + pip install plotly_express && \ + pip install albumentations && \ + pip install catalyst && \ + pip install osmnx && \ + apt-get -y install libspatialindex-dev && \ + pip install pytorch-ignite && \ + pip install qgrid && \ + pip install bqplot && \ + pip install earthengine-api && \ + pip install transformers && \ + pip install dlib && \ + pip install kaggle-environments && \ + pip install geopandas && \ + pip install nnabla && \ + pip install vowpalwabbit && \ + pip install pydub && \ + pip install pydegensac && \ + # b/198635596 latest versions of torchmetrics & pytorch-lightning are failing at runtime. + pip install torchmetrics==0.5.0 && \ + pip install pytorch-lightning==1.4.4 && \ + pip install datatable && \ + pip install sympy && \ + # flask is used by agents in the simulation competitions. + pip install flask && \ + # pycrypto is used by competitions team. 
+ pip install pycrypto && \ + pip install easyocr && \ + # ipympl adds interactive widget support for matplotlib + pip install ipympl==0.7.0 && \ + pip install pandarallel && \ + /tmp/clean-layer.sh + +# Download base easyocr models. +# https://github.com/JaidedAI/EasyOCR#usage +RUN mkdir -p /root/.EasyOCR/model && \ + wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" -O /root/.EasyOCR/model/latin.zip && \ + unzip /root/.EasyOCR/model/latin.zip -d /root/.EasyOCR/model/ && \ + rm /root/.EasyOCR/model/latin.zip && \ + wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" -O /root/.EasyOCR/model/english.zip && \ + unzip /root/.EasyOCR/model/english.zip -d /root/.EasyOCR/model/ && \ + rm /root/.EasyOCR/model/english.zip && \ + wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" -O /root/.EasyOCR/model/craft_mlt_25k.zip && \ + unzip /root/.EasyOCR/model/craft_mlt_25k.zip -d /root/.EasyOCR/model/ && \ + rm /root/.EasyOCR/model/craft_mlt_25k.zip && \ + /tmp/clean-layer.sh + +# Tesseract and some associated utility packages +RUN apt-get install tesseract-ocr -y && \ + pip install pytesseract && \ + pip install wand && \ + pip install pdf2image && \ + pip install PyPDF && \ + pip install pyocr && \ + /tmp/clean-layer.sh +ENV TESSERACT_PATH=/usr/bin/tesseract + +# For Facets +ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ +# For Theano with MKL +ENV MKL_THREADING_LAYER=GNU + +# Temporary fixes and patches +# Temporary patch for Dask getting downgraded, which breaks Keras +RUN pip install --upgrade dask && \ + # Stop jupyter nbconvert trying to rewrite its folder hierarchy + mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ + mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ + # Stop Matplotlib printing junk to the 
console on first load + sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.7/site-packages/matplotlib/font_manager.py && \ + # Make matplotlib output in Jupyter notebooks display correctly + mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ + # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 until conda package gets updated to 0.38.5 or higher. + ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ + /tmp/clean-layer.sh + +# gcloud SDK https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ + | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ + apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get update -y && apt-get install google-cloud-sdk -y && \ + /tmp/clean-layer.sh + +# Add BigQuery client proxy settings +ENV PYTHONUSERBASE "/root/.local" +ADD patches/kaggle_gcp.py /root/.local/lib/python3.7/site-packages/kaggle_gcp.py +ADD patches/kaggle_secrets.py /root/.local/lib/python3.7/site-packages/kaggle_secrets.py +ADD patches/kaggle_session.py /root/.local/lib/python3.7/site-packages/kaggle_session.py +ADD patches/kaggle_web_client.py /root/.local/lib/python3.7/site-packages/kaggle_web_client.py +ADD patches/kaggle_datasets.py /root/.local/lib/python3.7/site-packages/kaggle_datasets.py +ADD patches/log.py /root/.local/lib/python3.7/site-packages/log.py +ADD patches/sitecustomize.py /root/.local/lib/python3.7/site-packages/sitecustomize.py +# Override default imagemagick policies +ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml + +# TensorBoard Jupyter extension. 
Should be replaced with TensorBoard's provided magic once we have +# worker tunneling support in place. +# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. +# ENV JUPYTER_CONFIG_DIR "/root/.jupyter/" +# RUN pip install jupyter_tensorboard && \ +# jupyter serverextension enable jupyter_tensorboard && \ +# jupyter tensorboard enable +# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py +# TODO(rosbo): Will likely need to remove tensorboard. + +# Disable unnecessary jupyter extensions +RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ + jupyter-serverextension disable nb_conda --py --sys-prefix && \ + python -m nb_conda_kernels.install --disable + +# Set backend for matplotlib +ENV MPLBACKEND "agg" + +ARG GIT_COMMIT=unknown +ARG BUILD_DATE=unknown + +LABEL git-commit=$GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL tensorflow-version=$TENSORFLOW_VERSION +# Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. +LABEL kaggle-lang=python + +# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. +RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date diff --git a/Jenkinsfile b/Jenkinsfile index 878a169e..dc6a3860 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -20,101 +20,99 @@ pipeline { } stages { - stage('Docker CPU Build') { - options { - timeout(time: 120, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - - ./build | ts - ./push ${PRETEST_TAG} - ''' - } - } - - stage('Test CPU Image') { - options { - timeout(time: 5, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' - } - } - - stage('Docker GPU Build') { - // A GPU is not required to build this image. 
However, in our current setup, - // the default runtime is set to nvidia (as opposed to runc) and there - // is no option to specify a runtime for the `docker build` command. - // - // TODO(rosbo) don't set `nvidia` as the default runtime and use the - // `--runtime=nvidia` flag for the `docker run` command when GPU support is needed. - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. - docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" - # Remove any dangling images (no tags). - # All builds for the same branch uses the same tag. This means a subsequent build for the same branch - # will untag the previously built image which is safe to do. Builds for a single branch are performed - # serially. - docker image prune -f - ./build --gpu --base-image-tag ${PRETEST_TAG} | ts - ./push --gpu ${PRETEST_TAG} - ''' - } - } - - stage('Test GPU Image') { - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 20, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail + stage('Docker Build') { + parallel { + stage('CPU') { + stages { + stage('Docker CPU Build') { + options { + timeout(time: 120, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } + ./build | ts + ./push ${PRETEST_TAG} + ''' + } + } + stage('Test CPU Image') { + options { + timeout(time: 5, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail - stage('Package Versions') { - parallel { - stage('CPU Diff') { - steps { - sh '''#!/bin/bash - set -exo pipefail + date + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image 
gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } + } + stage('CPU Diff') { + steps { + sh '''#!/bin/bash + set -exo pipefail - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } + } } } - stage('GPU Diff') { + stage('GPU') { agent { label 'ephemeral-linux-gpu' } - steps { - sh '''#!/bin/bash - set -exo pipefail + stages { + stage('Docker GPU Build') { + options { + timeout(time: 60, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. + docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" + # Remove any dangling images (no tags). + # All builds for the same branch uses the same tag. This means a subsequent build for the same branch + # will untag the previously built image which is safe to do. Builds for a single branch are performed + # serially. 
+ docker image prune -f + ./build --gpu --base-image-tag ${PRETEST_TAG} | ts + ./push --gpu ${PRETEST_TAG} + ''' + } + } + stage('Test GPU Image') { + agent { label 'ephemeral-linux-gpu' } + options { + timeout(time: 20, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + stage('GPU Diff') { + agent { label 'ephemeral-linux-gpu' } + steps { + sh '''#!/bin/bash + set -exo pipefail + + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } } - } + } } } diff --git a/build b/build index ae9a9779..587cf9bc 100755 --- a/build +++ b/build @@ -15,6 +15,7 @@ EOF CACHE_FLAG='--no-cache' DOCKERFILE='Dockerfile' +ACCELERATOR='none' IMAGE_TAG='kaggle/python-build' BUILD_ARGS='' @@ -27,6 +28,7 @@ while :; do -g|--gpu) IMAGE_TAG='kaggle/python-gpu-build' DOCKERFILE='gpu.Dockerfile' + ACCELERATOR='gpu' ;; -c|--use-cache) CACHE_FLAG='' @@ -57,8 +59,22 @@ BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')" readonly CACHE_FLAG readonly DOCKERFILE +readonly ACCELERATOR readonly IMAGE_TAG readonly BUILD_ARGS + +SRCDIR=$(dirname "${BASH_SOURCE[0]}") +DOCKERFILE_OUTDIR="${SRCDIR}/.generated" +mkdir -p $DOCKERFILE_OUTDIR +DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE" + +# Generate Dockerfile from template. +echo "Generating Dockerfile from template..." 
+docker run --rm -v $PWD:/input:ro gcr.io/kaggle-images/go-renderizer:latest --ACCELERATOR=$ACCELERATOR /input/Dockerfile.tmpl > $DOCKERFILE_PATH +echo "==================== $DOCKERFILE START ====================" +cat $DOCKERFILE_PATH +echo "==================== $DOCKERFILE END ====================" + set -x -docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE" $BUILD_ARGS . +docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" $BUILD_ARGS . diff --git a/renderizer/Dockerfile b/renderizer/Dockerfile new file mode 100644 index 00000000..9faac229 --- /dev/null +++ b/renderizer/Dockerfile @@ -0,0 +1,12 @@ +# Image used to generate the Dockerfiles from a Go text template. +# +# Build: +# docker build --rm --pull -t gcr.io/kaggle-images/go-renderizer -f Dockerfile . +# +# Push: +# docker push gcr.io/kaggle-images/go-renderizer +FROM golang:1.17 + +RUN go install github.com/gomatic/renderizer/v2/cmd/renderizer@v2.0.13 + +ENTRYPOINT ["renderizer"] \ No newline at end of file From a8013e7d051a5691859cd7c63ee660b4a9bb4ffd Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 17 Sep 2021 22:35:10 +0000 Subject: [PATCH 02/12] remove base-tag, re-org Jenkinsfile --- Dockerfile.tmpl | 6 +- Jenkinsfile | 148 +++++++++++++++++++++++------------------------- build | 10 ---- 3 files changed, 74 insertions(+), 90 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index b1f0bec3..e027738b 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,9 +1,7 @@ -ARG BASE_TAG=m78 - {{ if eq .Accelerator "gpu" }} -FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:${BASE_TAG} +FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m78 {{ else }} -FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:${BASE_TAG} +FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m78 {{ end }} # Keep these variables in sync if base image is updated. 
ENV TENSORFLOW_VERSION=2.6.0 diff --git a/Jenkinsfile b/Jenkinsfile index dc6a3860..f73290a0 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -20,100 +20,96 @@ pipeline { } stages { - stage('Docker Build') { - parallel { - stage('CPU') { - stages { - stage('Docker CPU Build') { - options { - timeout(time: 120, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - - ./build | ts - ./push ${PRETEST_TAG} - ''' - } + parallel { + stage('CPU') { + stages { + stage('Docker CPU Build') { + options { + timeout(time: 120, unit: 'MINUTES') } - stage('Test CPU Image') { - options { - timeout(time: 5, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail + steps { + sh '''#!/bin/bash + set -exo pipefail - date - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' - } + ./build | ts + ./push ${PRETEST_TAG} + ''' + } + } + stage('Test CPU Image') { + options { + timeout(time: 5, unit: 'MINUTES') } - stage('CPU Diff') { - steps { - sh '''#!/bin/bash + steps { + sh '''#!/bin/bash set -exo pipefail + date docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} ''' - } + } + } + stage('CPU Diff') { + steps { + sh '''#!/bin/bash + set -exo pipefail + + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' } } } - stage('GPU') { - agent { label 'ephemeral-linux-gpu' } - stages { - stage('Docker GPU Build') { - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. - docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" - # Remove any dangling images (no tags). - # All builds for the same branch uses the same tag. 
This means a subsequent build for the same branch - # will untag the previously built image which is safe to do. Builds for a single branch are performed - # serially. - docker image prune -f - ./build --gpu --base-image-tag ${PRETEST_TAG} | ts - ./push --gpu ${PRETEST_TAG} - ''' - } + } + stage('GPU') { + agent { label 'ephemeral-linux-gpu' } + stages { + stage('Docker GPU Build') { + options { + timeout(time: 60, unit: 'MINUTES') } - stage('Test GPU Image') { - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 20, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } + steps { + sh '''#!/bin/bash + set -exo pipefail + # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. + docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" + # Remove any dangling images (no tags). + # All builds for the same branch uses the same tag. This means a subsequent build for the same branch + # will untag the previously built image which is safe to do. Builds for a single branch are performed + # serially. 
+ docker image prune -f + ./build --gpu --base-image-tag ${PRETEST_TAG} | ts + ./push --gpu ${PRETEST_TAG} + ''' } - stage('GPU Diff') { - agent { label 'ephemeral-linux-gpu' } - steps { - sh '''#!/bin/bash + } + stage('Test GPU Image') { + options { + timeout(time: 20, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash set -exo pipefail + date docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} ''' - } } } - } - } + stage('GPU Diff') { + steps { + sh '''#!/bin/bash + set -exo pipefail + + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + } } stage('Label CPU/GPU Staging Images') { diff --git a/build b/build index 587cf9bc..4fc9c8e5 100755 --- a/build +++ b/build @@ -9,7 +9,6 @@ Build a new Python Docker image. Options: -g, --gpu Build an image with GPU support. -c, --use-cache Use layer cache when building a new image. - -b, --base-image-tag TAG Base image tag. Defaults to value defined in DOCKERFILE. 
EOF } @@ -33,15 +32,6 @@ while :; do -c|--use-cache) CACHE_FLAG='' ;; - -b|--base-image-tag) - if [[ -z $2 ]]; then - usage - printf 'ERROR: No TAG specified after the %s flag.\n' "$1" >&2 - exit - fi - BUILD_ARGS="--build-arg BASE_TAG=$2" - shift # skip the flag value - ;; -?*) usage printf 'ERROR: Unknown option: %s\n' "$1" >&2 From d7be1fd820a13b6768364e840b5f0acc9c6f99e6 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 17 Sep 2021 22:41:12 +0000 Subject: [PATCH 03/12] fix jenkinsfile --- Jenkinsfile | 153 +++++++++++++++++++++++++++------------------------- 1 file changed, 81 insertions(+), 72 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f73290a0..4d630358 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -20,96 +20,105 @@ pipeline { } stages { - parallel { - stage('CPU') { - stages { - stage('Docker CPU Build') { - options { - timeout(time: 120, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail + stage('Clean Images') { + steps { + sh '''#!/bin/bash + set -exo pipefail + # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. + docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" + # Remove any dangling images (no tags). + # All builds for the same branch uses the same tag. This means a subsequent build for the same branch + # will untag the previously built image which is safe to do. Builds for a single branch are performed + # serially. 
+ docker image prune -f + ''' + } + } + stage('Build/Test/Diff') { + parallel { + stage('CPU') { + stages { + stage('Build CPU Image') { + options { + timeout(time: 120, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail - ./build | ts - ./push ${PRETEST_TAG} - ''' + ./build | ts + ./push ${PRETEST_TAG} + ''' + } } - } - stage('Test CPU Image') { - options { - timeout(time: 5, unit: 'MINUTES') + stage('Test CPU Image') { + options { + timeout(time: 5, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } } - steps { - sh '''#!/bin/bash + stage('Diff CPU image') { + steps { + sh '''#!/bin/bash set -exo pipefail - date docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} + ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} ''' - } - } - stage('CPU Diff') { - steps { - sh '''#!/bin/bash - set -exo pipefail - - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' + } } } } - } - stage('GPU') { - agent { label 'ephemeral-linux-gpu' } - stages { - stage('Docker GPU Build') { - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - # Remove images (dangling or not) created more than 120h (5 days ago) to prevent disk from filling up. - docker image prune --all --force --filter "until=120h" --filter "label=kaggle-lang=python" - # Remove any dangling images (no tags). - # All builds for the same branch uses the same tag. This means a subsequent build for the same branch - # will untag the previously built image which is safe to do. Builds for a single branch are performed - # serially. 
- docker image prune -f - ./build --gpu --base-image-tag ${PRETEST_TAG} | ts - ./push --gpu ${PRETEST_TAG} - ''' + stage('GPU') { + agent { label 'ephemeral-linux-gpu' } + stages { + stage('Build GPU Image') { + options { + timeout(time: 60, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + ./build --gpu | ts + ./push --gpu ${PRETEST_TAG} + ''' + } } - } - stage('Test GPU Image') { - options { - timeout(time: 20, unit: 'MINUTES') + stage('Test GPU Image') { + options { + timeout(time: 20, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } } - steps { - sh '''#!/bin/bash + stage('Diff GPU Image') { + steps { + sh '''#!/bin/bash set -exo pipefail - date docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} ''' + } } } - stage('GPU Diff') { - steps { - sh '''#!/bin/bash - set -exo pipefail - - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - } + } + } } stage('Label CPU/GPU Staging Images') { From 73d40ed47f1821c3161c679950f48862eb40cbc4 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 17 Sep 2021 22:51:30 +0000 Subject: [PATCH 04/12] remove blank lines --- Dockerfile.tmpl | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index e027738b..de995adc 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -25,8 +25,6 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list & apt-get install -y openssh-client && \ /tmp/clean-layer.sh - - # TODO(rosbo): Is this needed? Make sure gpu and stubs are properly set. 
# Make sure the dynamic linker finds the right libstdc++ # ENV LD_LIBRARY_PATH=/opt/conda/lib From 3b475f8fcbc845c8f8741d164c4977eb31ca2187 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 17 Sep 2021 23:28:54 +0000 Subject: [PATCH 05/12] Increase GPU build time --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4d630358..41aa6a90 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -82,7 +82,7 @@ pipeline { stages { stage('Build GPU Image') { options { - timeout(time: 60, unit: 'MINUTES') + timeout(time: 120, unit: 'MINUTES') } steps { sh '''#!/bin/bash From 7e6de43f76624805036ae29ebf6e93dafdad7595 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Mon, 20 Sep 2021 16:39:54 +0000 Subject: [PATCH 06/12] install torchaudio/torchtext on GPU build --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index de995adc..e4719665 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -49,7 +49,7 @@ RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MI # Install PyTorch {{ if eq .Accelerator "gpu" }} -RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \ +RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \ /tmp/clean-layer.sh {{ else }} RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \ From 5fc2ea6851d44687cdab18ddbb8fe8f0e38a53ee Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Tue, 21 Sep 2021 20:42:16 +0000 Subject: [PATCH 07/12] Turn off KMP_AFFINITY logs --- Dockerfile.tmpl | 7 +++++-- 1 file changed, 5 
insertions(+), 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index e4719665..d61e22d8 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,12 +1,15 @@ {{ if eq .Accelerator "gpu" }} FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m78 +ENV CUDA_MAJOR_VERSION=11 +ENV CUDA_MINOR_VERSION=0 {{ else }} FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m78 {{ end }} # Keep these variables in sync if base image is updated. ENV TENSORFLOW_VERSION=2.6.0 -ENV CUDA_MAJOR_VERSION=11 -ENV CUDA_MINOR_VERSION=0 +# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 +# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information +ENV KMP_WARNINGS=0 ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl From 205a31b87467291f02ebae45ca912645f47e89f6 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Thu, 23 Sep 2021 18:51:10 +0000 Subject: [PATCH 08/12] remove horovod --- Dockerfile.tmpl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index d61e22d8..59c27fea 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -2,6 +2,9 @@ FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m78 ENV CUDA_MAJOR_VERSION=11 ENV CUDA_MINOR_VERSION=0 +# b/200968891 Keeps horovod once torch is upgraded. +RUN pip uninstall -y horovod && \ + /tmp/clean-layer.sh {{ else }} FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m78 {{ end }} @@ -535,7 +538,6 @@ ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml # jupyter serverextension enable jupyter_tensorboard && \ # jupyter tensorboard enable # ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py -# TODO(rosbo): Will likely need to remove tensorboard. 
# Disable unnecessary jupyter extensions RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ From 1b78937bb024c3e47df8c4f2adc7308167d5780f Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Thu, 23 Sep 2021 20:17:19 +0000 Subject: [PATCH 09/12] Move uninstall statement after clean-layer.sh has been added --- Dockerfile.tmpl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 59c27fea..9fdc0035 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -2,9 +2,6 @@ FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m78 ENV CUDA_MAJOR_VERSION=11 ENV CUDA_MINOR_VERSION=0 -# b/200968891 Keeps horovod once torch is upgraded. -RUN pip uninstall -y horovod && \ - /tmp/clean-layer.sh {{ else }} FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m78 {{ end }} @@ -18,6 +15,12 @@ ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json +{{ if eq .Accelerator "gpu" }} +# b/200968891 Keeps horovod once torch is upgraded. 
+RUN pip uninstall -y horovod && \ + /tmp/clean-layer.sh +{{ end }} + # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \ From 21faceecc0ba290d170dcf579c9193e314932b3b Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Thu, 23 Sep 2021 20:18:08 +0000 Subject: [PATCH 10/12] remove uninstsall lightgbm statement for gpu now that it's templatized --- Dockerfile.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 9fdc0035..fd8c007e 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -71,7 +71,7 @@ ENV LIGHTGBM_VERSION=3.2.1 # Install OpenCL & libboost (required by LightGBM GPU version) RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \ mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \pip uninstall -y lightgbm && \ + echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ cd /usr/local/src && \ git clone --recursive https://github.com/microsoft/LightGBM && \ cd LightGBM && \ From d00e6d8dafeef36530d4f2a68d4aa09ddd3eb26b Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 24 Sep 2021 01:27:39 +0000 Subject: [PATCH 11/12] Remove CPU & GPU Dockerfiles --- Dockerfile | 501 ------------------------------------------------- gpu.Dockerfile | 104 ---------- 2 files changed, 605 deletions(-) delete mode 100644 Dockerfile delete mode 100644 gpu.Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 230f5d13..00000000 --- a/Dockerfile +++ /dev/null @@ -1,501 +0,0 @@ -ARG BASE_TAG=m78 -ARG TENSORFLOW_VERSION=2.4.1 - -FROM gcr.io/deeplearning-platform-release/base-cpu:${BASE_TAG} - -# We need to redefine TENSORFLOW_VERSION here to get the default ARG value defined above the FROM instruction. 
-# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG TENSORFLOW_VERSION - -ADD clean-layer.sh /tmp/clean-layer.sh -ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl -ADD patches/template_conf.json /opt/kaggle/conf.json - -# Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, -# as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 -RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \ - apt-get update && \ - # Needed by vowpalwabbit & lightGBM (GPU build). - # https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Python#installing - # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm - apt-get install -y build-essential unzip cmake && \ - apt-get install -y libboost-dev libboost-program-options-dev libboost-system-dev libboost-thread-dev libboost-math-dev libboost-test-dev libboost-python-dev libboost-filesystem-dev zlib1g-dev && \ - # b/182601974: ssh client was removed from the base image but is required for packages such as stable-baselines. - apt-get install -y openssh-client && \ - /tmp/clean-layer.sh - -# Make sure the dynamic linker finds the right libstdc++ -ENV LD_LIBRARY_PATH=/opt/conda/lib -# b/128333086: Set PROJ_LIB to points to the proj4 cartographic library. -ENV PROJ_LIB=/opt/conda/share/proj - -# Install conda packages not available on pip. -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -RUN conda config --add channels nvidia && \ - conda config --add channels rapidsai && \ - # Base image channel order: conda-forge (highest priority), defaults. - # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. 
- conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \ - /tmp/clean-layer.sh - -RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \ - /tmp/clean-layer.sh - -RUN pip install pysal && \ - pip install seaborn python-dateutil dask python-igraph && \ - pip install pyyaml joblib husl geopy ml_metrics mne pyshp && \ - pip install pandas && \ - # Install h2o from source. - # Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda. - apt-get install -y default-jre-headless && \ - pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \ - /tmp/clean-layer.sh - -RUN pip install tensorflow==${TENSORFLOW_VERSION} && \ - pip install tensorflow-gcs-config==2.4.0 && \ - pip install tensorflow-addons==0.12.1 && \ - pip install tensorflow_probability==0.12.2 && \ - /tmp/clean-layer.sh - -RUN apt-get install -y libfreetype6-dev && \ - apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ - # b/198300835 kornia 4.1.0 is not compatible with our version of numpy. - pip install gensim==4.0.1 && \ - pip install textblob && \ - pip install wordcloud && \ - pip install xgboost && \ - # Pinned to match GPU version. Update version together. - pip install lightgbm==3.2.1 && \ - pip install pydot && \ - pip install keras-tuner && \ - pip install flake8 && \ - # Pinned because it breaks theano test with the latest version (b/178107003). - pip install theano-pymc==1.0.11 && \ - pip install python-Levenshtein && \ - pip install hep_ml && \ - # NLTK Project datasets - mkdir -p /usr/share/nltk_data && \ - # NLTK Downloader no longer continues smoothly after an error, so we explicitly list - # the corpuses that work - # "yes | ..." answers yes to the retry prompt in case of an error. See b/133762095. 
- yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ - basque_grammars biocreative_ppi bllip_wsj_no_aux \ - book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ - comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ - europarl_raw floresta gazetteers genesis gutenberg \ - ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \ - masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ - mte_teip5 names nps_chat omw opinion_lexicon paradigms \ - pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ - pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ - sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ - state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ - twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ - vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - # Stop-words - pip install stop-words && \ - pip install scikit-image && \ - /tmp/clean-layer.sh - -RUN pip install ibis-framework && \ - pip install mxnet && \ - pip install gluonnlp && \ - pip install gluoncv && \ - /tmp/clean-layer.sh - -RUN pip install scipy && \ - # b/176817038 avoid upgrade to 0.24 which is causing issues with hep-ml package. 
- pip install scikit-learn==0.23.2 && \ - # HDF5 support - pip install h5py && \ - pip install biopython && \ - # PUDB, for local debugging convenience - pip install pudb && \ - pip install imbalanced-learn && \ - # Profiling and other utilities - pip install line_profiler && \ - pip install orderedmultidict && \ - pip install smhasher && \ - pip install bokeh && \ - pip install numba && \ - pip install datashader && \ - # Boruta (python implementation) - pip install Boruta && \ - apt-get install -y graphviz && pip install graphviz && \ - # Pandoc is a dependency of deap - apt-get install -y pandoc && \ - pip install git+git://github.com/scikit-learn-contrib/py-earth.git@issue191 && \ - pip install essentia && \ - /tmp/clean-layer.sh - -# vtk with dependencies -RUN apt-get install -y libgl1-mesa-glx && \ - pip install vtk && \ - # xvfbwrapper with dependencies - apt-get install -y xvfb && \ - pip install xvfbwrapper && \ - /tmp/clean-layer.sh - -RUN pip install mpld3 && \ - pip install gpxpy && \ - pip install arrow && \ - pip install nilearn && \ - pip install nibabel && \ - pip install pronouncing && \ - pip install markovify && \ - pip install imgaug && \ - pip install preprocessing && \ - pip install path.py && \ - pip install Geohash && \ - # https://github.com/vinsci/geohash/issues/4 - sed -i -- 's/geohash/.geohash/g' /opt/conda/lib/python3.7/site-packages/Geohash/__init__.py && \ - pip install deap && \ - pip install tpot && \ - pip install scikit-optimize && \ - pip install haversine && \ - pip install toolz cytoolz && \ - pip install plotly && \ - pip install hyperopt && \ - pip install fitter && \ - pip install langid && \ - # Delorean. 
Useful for dealing with datetime - pip install delorean && \ - pip install trueskill && \ - # Useful data exploration libraries (for missing data and generating reports) - pip install missingno && \ - pip install pandas-profiling && \ - pip install s2sphere && \ - pip install bayesian-optimization && \ - pip install matplotlib-venn && \ - # b/184083722 pyldavis >= 3.3 requires numpy >= 1.20.0 but TensorFlow 2.4.1 / 2.5.0 requires 1.19.2 - pip install pyldavis==3.2.2 && \ - pip install mlxtend && \ - pip install altair && \ - # b/183944405 pystan 3.x is not compatible with fbprophet. - pip install pystan==2.19.1.1 && \ - pip install ImageHash && \ - pip install ecos && \ - pip install CVXcanon && \ - pip install pymc3 && \ - pip install imagecodecs && \ - pip install tifffile && \ - pip install spectral && \ - pip install descartes && \ - pip install geojson && \ - pip install pydicom && \ - pip install wavio && \ - pip install SimpleITK && \ - pip install hmmlearn && \ - pip install bayespy && \ - pip install gplearn && \ - pip install PyAstronomy && \ - pip install squarify && \ - pip install fuzzywuzzy && \ - pip install python-louvain && \ - pip install pyexcel-ods && \ - pip install sklearn-pandas && \ - pip install stemming && \ - pip install fbprophet && \ - pip install holoviews && \ - pip install geoviews && \ - pip install hypertools && \ - pip install py_stringsimjoin && \ - pip install mlens && \ - pip install scikit-multilearn && \ - pip install cleverhans && \ - pip install leven && \ - pip install catboost && \ - pip install lightfm && \ - pip install folium && \ - pip install scikit-plot && \ - # dipy requires the optional fury dependency for visualizations. 
- pip install fury dipy && \ - pip install plotnine && \ - pip install scikit-surprise && \ - pip install pymongo && \ - pip install geoplot && \ - pip install eli5 && \ - pip install implicit && \ - pip install kaggle && \ - /tmp/clean-layer.sh - -RUN pip install tensorpack && \ - # Add google PAIR-code Facets - cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ - export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install pycountry && \ - pip install iso3166 && \ - pip install pydash && \ - pip install kmodes --no-dependencies && \ - pip install librosa && \ - pip install polyglot && \ - pip install mmh3 && \ - pip install fbpca && \ - pip install sentencepiece && \ - pip install cufflinks && \ - pip install lime && \ - pip install memory_profiler && \ - /tmp/clean-layer.sh - -# install cython & cysignals before pyfasttext -RUN pip install --upgrade cython && \ - pip install --upgrade cysignals && \ - pip install pyfasttext && \ - pip install fasttext && \ - apt-get install -y libhunspell-dev && pip install hunspell && \ - pip install annoy && \ - pip install category_encoders && \ - # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - pip install google-cloud-automl==1.0.1 && \ - pip install google-cloud-bigquery==2.2.0 && \ - pip install google-cloud-storage && \ - pip install google-cloud-translate==3.* && \ - pip install google-cloud-language==2.* && \ - pip install google-cloud-videointelligence==2.* && \ - pip install google-cloud-vision==2.* && \ - # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. 
- pip uninstall -y google-cloud-bigquery-storage && \ - # After launch this should be installed from pip - pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release && \ - pip install ortools && \ - pip install scattertext && \ - # Pandas data reader - pip install pandas-datareader && \ - pip install wordsegment && \ - pip install wordbatch && \ - pip install emoji && \ - # Add Japanese morphological analysis engine - pip install janome && \ - pip install wfdb && \ - pip install vecstack && \ - # yellowbrick machine learning visualization library - pip install yellowbrick && \ - pip install mlcrate && \ - /tmp/clean-layer.sh - -RUN pip install bleach && \ - pip install certifi && \ - pip install cycler && \ - pip install decorator && \ - pip install entrypoints && \ - pip install html5lib && \ - pip install ipykernel && \ - pip install ipython && \ - pip install ipython-genutils && \ - pip install ipywidgets && \ - pip install isoweek && \ - pip install jedi && \ - pip install Jinja2 && \ - pip install jsonschema && \ - pip install jupyter-client && \ - pip install jupyter-console && \ - pip install jupyter-core && \ - pip install MarkupSafe && \ - pip install mistune && \ - pip install nbconvert && \ - pip install nbformat && \ - pip install notebook && \ - pip install papermill && \ - pip install olefile && \ - # b/198300835 kornia 0.5.10 is not compatible with our version of numpy. 
- pip install kornia==0.5.8 && \ - pip install pandas_summary && \ - pip install pandocfilters && \ - pip install pexpect && \ - pip install pickleshare && \ - pip install Pillow && \ - # Install openslide and its python binding - apt-get install -y openslide-tools && \ - pip install openslide-python && \ - pip install ptyprocess && \ - pip install Pygments && \ - pip install pyparsing && \ - pip install pytz && \ - pip install PyYAML && \ - pip install pyzmq && \ - pip install qtconsole && \ - pip install six && \ - pip install terminado && \ - pip install tornado && \ - pip install tqdm && \ - pip install traitlets && \ - pip install wcwidth && \ - pip install webencodings && \ - pip install widgetsnbextension && \ - pip install pyarrow && \ - pip install feather-format && \ - # fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788 - pip install fastai==2.2.7 && \ - pip install allennlp && \ - # https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5 - pip install importlib-metadata==3.4.0 && \ - python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ - apt-get install -y ffmpeg && \ - /tmp/clean-layer.sh - - ########### - # - # NEW CONTRIBUTORS: - # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end - # of all non-final lines. Thanks! 
- # - ########### - -RUN pip install flashtext && \ - pip install wandb && \ - pip install marisa-trie && \ - pip install pyemd && \ - pip install pyupset && \ - pip install pympler && \ - pip install s3fs && \ - pip install featuretools && \ - pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper && \ - pip install hpsklearn && \ - pip install git+https://github.com/Kaggle/learntools && \ - pip install kmapper && \ - pip install shap && \ - pip install ray && \ - pip install gym && \ - pip install pyarabic && \ - pip install pandasql && \ - pip install tensorflow_hub && \ - pip install jieba && \ - pip install git+https://github.com/SauceCat/PDPbox && \ - # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668 - pip install https://github.com/hbasria/ggpy/archive/0.11.5.zip && \ - pip install cesium && \ - pip install rgf_python && \ - pip install tsfresh && \ - pip install pykalman && \ - pip install optuna && \ - pip install plotly_express && \ - pip install albumentations && \ - pip install catalyst && \ - pip install osmnx && \ - apt-get -y install libspatialindex-dev && \ - pip install pytorch-ignite && \ - pip install qgrid && \ - pip install bqplot && \ - pip install earthengine-api && \ - pip install transformers && \ - pip install dlib && \ - pip install kaggle-environments && \ - pip install geopandas && \ - pip install nnabla && \ - pip install vowpalwabbit && \ - # papermill can replace nbconvert for executing notebooks - pip install cloud-tpu-client && \ - # b/188429515#comment7 tensorflow-cloud >= 0.1.14 installs tensorflow-transform which install apache-beam which downgrades the google.cloud library to 1.x. - pip install tensorflow-cloud==0.1.13 && \ - pip install tensorflow-datasets && \ - pip install pydub && \ - pip install pydegensac && \ - # b/198635596 latest versions of torchmetrics & pytorch-lightning are failing at runtime. 
- pip install torchmetrics==0.5.0 && \ - pip install pytorch-lightning==1.4.4 && \ - pip install datatable && \ - pip install sympy && \ - # flask is used by agents in the simulation competitions. - pip install flask && \ - # pycrypto is used by competitions team. - pip install pycrypto && \ - pip install easyocr && \ - # Keep JAX version in sync with GPU image. - pip install jax[cpu]==0.2.19 && \ - # ipympl adds interactive widget support for matplotlib - pip install ipympl==0.7.0 && \ - pip install pandarallel && \ - /tmp/clean-layer.sh - -# Download base easyocr models. -# https://github.com/JaidedAI/EasyOCR#usage -RUN mkdir -p /root/.EasyOCR/model && \ - wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" -O /root/.EasyOCR/model/latin.zip && \ - unzip /root/.EasyOCR/model/latin.zip -d /root/.EasyOCR/model/ && \ - rm /root/.EasyOCR/model/latin.zip && \ - wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" -O /root/.EasyOCR/model/english.zip && \ - unzip /root/.EasyOCR/model/english.zip -d /root/.EasyOCR/model/ && \ - rm /root/.EasyOCR/model/english.zip && \ - wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" -O /root/.EasyOCR/model/craft_mlt_25k.zip && \ - unzip /root/.EasyOCR/model/craft_mlt_25k.zip -d /root/.EasyOCR/model/ && \ - rm /root/.EasyOCR/model/craft_mlt_25k.zip && \ - /tmp/clean-layer.sh - -# Tesseract and some associated utility packages -RUN apt-get install tesseract-ocr -y && \ - pip install pytesseract && \ - pip install wand && \ - pip install pdf2image && \ - pip install PyPDF && \ - pip install pyocr && \ - /tmp/clean-layer.sh -ENV TESSERACT_PATH=/usr/bin/tesseract - -# For Facets -ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ -# For Theano with MKL -ENV MKL_THREADING_LAYER=GNU - -# Temporary fixes and patches -# Temporary patch for Dask getting downgraded, which breaks Keras -RUN pip install 
--upgrade dask && \ - # Stop jupyter nbconvert trying to rewrite its folder hierarchy - mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ - mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ - # Stop Matplotlib printing junk to the console on first load - sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.7/site-packages/matplotlib/font_manager.py && \ - # Make matplotlib output in Jupyter notebooks display correctly - mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ - # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. - ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - /tmp/clean-layer.sh - -# gcloud SDK https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu -RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ - | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ - apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ - apt-get update -y && apt-get install google-cloud-sdk -y && \ - /tmp/clean-layer.sh - -# Add BigQuery client proxy settings -ENV PYTHONUSERBASE "/root/.local" -ADD patches/kaggle_gcp.py /root/.local/lib/python3.7/site-packages/kaggle_gcp.py -ADD patches/kaggle_secrets.py /root/.local/lib/python3.7/site-packages/kaggle_secrets.py -ADD patches/kaggle_session.py /root/.local/lib/python3.7/site-packages/kaggle_session.py -ADD patches/kaggle_web_client.py /root/.local/lib/python3.7/site-packages/kaggle_web_client.py -ADD patches/kaggle_datasets.py 
/root/.local/lib/python3.7/site-packages/kaggle_datasets.py -ADD patches/log.py /root/.local/lib/python3.7/site-packages/log.py -ADD patches/sitecustomize.py /root/.local/lib/python3.7/site-packages/sitecustomize.py -# Override default imagemagick policies -ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml - -# TensorBoard Jupyter extension. Should be replaced with TensorBoard's provided magic once we have -# worker tunneling support in place. -# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. -# ENV JUPYTER_CONFIG_DIR "/root/.jupyter/" -# RUN pip install jupyter_tensorboard && \ -# jupyter serverextension enable jupyter_tensorboard && \ -# jupyter tensorboard enable -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py - -# Disable unnecessary jupyter extensions -RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ - jupyter-serverextension disable nb_conda --py --sys-prefix && \ - python -m nb_conda_kernels.install --disable - -# Set backend for matplotlib -ENV MPLBACKEND "agg" - -ARG GIT_COMMIT=unknown -ARG BUILD_DATE=unknown - -LABEL git-commit=$GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL tensorflow-version=$TENSORFLOW_VERSION -# Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. -LABEL kaggle-lang=python - -# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. 
-RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date diff --git a/gpu.Dockerfile b/gpu.Dockerfile deleted file mode 100644 index 191d54a1..00000000 --- a/gpu.Dockerfile +++ /dev/null @@ -1,104 +0,0 @@ -ARG BASE_TAG=staging - -FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 AS nvidia -FROM gcr.io/kaggle-images/python:${BASE_TAG} - -ADD clean-layer.sh /tmp/clean-layer.sh - -# Cuda support -COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/ -COPY --from=nvidia /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/ -COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg - -ENV CUDA_MAJOR_VERSION=11 -ENV CUDA_MINOR_VERSION=0 -ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/opt/bin:${PATH} -# The stub is useful to us both for built-time linking and run-time linking, on CPU-only systems. -# When intended to be used with actual GPUs, make sure to (besides providing access to the host -# CUDA user libraries, either manually or through the use of nvidia-docker) exclude them. One -# convenient way to do so is to obscure its contents by a bind mount: -# docker run .... -v /non-existing-directory:/usr/local/cuda/lib64/stubs:ro ... -# b/197989446#comment7 libgnutls version at /opt/conda/lib causes apt to fail to fetch packages using https URLs. 
-ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cupti-$CUDA_VERSION \ - cuda-cudart-$CUDA_VERSION \ - cuda-cudart-dev-$CUDA_VERSION \ - cuda-libraries-$CUDA_VERSION \ - cuda-libraries-dev-$CUDA_VERSION \ - cuda-nvml-dev-$CUDA_VERSION \ - cuda-minimal-build-$CUDA_VERSION \ - cuda-command-line-tools-$CUDA_VERSION \ - libcudnn8=8.0.4.30-1+cuda$CUDA_VERSION \ - libcudnn8-dev=8.0.4.30-1+cuda$CUDA_VERSION \ - libnccl2=2.7.8-1+cuda$CUDA_VERSION \ - libnccl-dev=2.7.8-1+cuda$CUDA_VERSION && \ - ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda && \ - ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \ - /tmp/clean-layer.sh - -ENV LD_LIBRARY_PATH_NO_STUBS="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/opt/conda/lib" -ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/conda/lib" -ENV NVIDIA_VISIBLE_DEVICES=all -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility -ENV NVIDIA_REQUIRE_CUDA="cuda>=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION" - -# Install OpenCL & libboost (required by LightGBM GPU version) -RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - /tmp/clean-layer.sh - -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -# However, because this image is based on the CPU image, this isn't possible but better -# to put them at the top of this file to minize conflicts. -RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_VERSION && \ - /tmp/clean-layer.sh - -# Install Pytorch and torchvision with GPU support. -# Note: torchtext and torchaudio do not require a separate GPU package. 
-RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \ - /tmp/clean-layer.sh - -# Install LightGBM with GPU -RUN pip uninstall -y lightgbm && \ - cd /usr/local/src && \ - git clone --recursive https://github.com/microsoft/LightGBM && \ - cd LightGBM && \ - git checkout tags/v3.2.1 && \ - mkdir build && cd build && \ - cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \ - make -j$(nproc) && \ - cd /usr/local/src/LightGBM/python-package && \ - python setup.py install --precompile && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - /tmp/clean-layer.sh - -# Install JAX (Keep JAX version in sync with CPU image) -RUN pip install jax[cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION]==0.2.19 -f https://storage.googleapis.com/jax-releases/jax_releases.html && \ - /tmp/clean-layer.sh - -# Reinstall packages with a separate version for GPU support. -RUN pip uninstall -y mxnet && \ - pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh - -# Install GPU-only packages -RUN pip install pycuda && \ - pip install pynvrtc && \ - # b/190622765 latest version is causing issue. nnabla fixed it in https://github.com/sony/nnabla/issues/892, waiting for new release before we can remove this pin. - pip install pynvml==8.0.4 && \ - pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh - -# Re-add TensorBoard Jupyter extension patch -# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.7/site-packages/tensorboard/notebook.py - -# Remove the CUDA stubs. 
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" From 0a5d88ce851fd4741be15b8d9692e875b2b3c8ac Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Fri, 24 Sep 2021 18:05:04 +0000 Subject: [PATCH 12/12] Remove duplicated code & resolved TODO --- Dockerfile.tmpl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index fd8c007e..c68e980d 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -34,10 +34,6 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list & apt-get install -y openssh-client && \ /tmp/clean-layer.sh -# TODO(rosbo): Is this needed? Make sure gpu and stubs are properly set. -# Make sure the dynamic linker finds the right libstdc++ -# ENV LD_LIBRARY_PATH=/opt/conda/lib - # b/128333086: Set PROJ_LIB to points to the proj4 cartographic library. ENV PROJ_LIB=/opt/conda/share/proj @@ -81,8 +77,6 @@ RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \ make -j$(nproc) && \ cd /usr/local/src/LightGBM/python-package && \ python setup.py install --precompile && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ /tmp/clean-layer.sh {{ else }} RUN pip install lightgbm==$LIGHTGBM_VERSION && \