From e6619f71af0b1e5abbd5a46f8abe414bb2abcfca Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 18 Apr 2019 15:54:33 -0700 Subject: [PATCH 1/8] Add support for ray 0.6.5. --- ray/docker/0.6.5/Dockerfile.tf | 40 ++++++++++++++++++++++++ test/resources/ray_cartpole/train_ray.py | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 ray/docker/0.6.5/Dockerfile.tf diff --git a/ray/docker/0.6.5/Dockerfile.tf b/ray/docker/0.6.5/Dockerfile.tf new file mode 100644 index 0000000..0b70044 --- /dev/null +++ b/ray/docker/0.6.5/Dockerfile.tf @@ -0,0 +1,40 @@ +ARG processor +FROM 520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-scriptmode:1.12.0-$processor-py3 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + jq \ + libav-tools \ + libjpeg-dev \ + libxrender1 \ + python3.6-dev \ + python3-opengl \ + wget \ + xvfb && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + Cython==0.29.7 \ + gym==0.12.1 \ + lz4 \ + opencv-python-headless \ + PyOpenGL==3.1.0 \ + pyyaml \ + redis==3.2.1 \ + ray==0.6.5 \ + ray[rllib]==0.6.5 \ + scipy + +# https://click.palletsprojects.com/en/7.x/python3/ +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +# Copy workaround script for incorrect hostname +COPY lib/changehostname.c / + +COPY lib/start.sh /usr/local/bin/start.sh +RUN chmod +x /usr/local/bin/start.sh + +# Starts framework +ENTRYPOINT ["bash", "-m", "start.sh"] diff --git a/test/resources/ray_cartpole/train_ray.py b/test/resources/ray_cartpole/train_ray.py index 3fe2040..e5ccf9d 100644 --- a/test/resources/ray_cartpole/train_ray.py +++ b/test/resources/ray_cartpole/train_ray.py @@ -5,7 +5,7 @@ from ray.tune.logger import pretty_print # Based on https://github.com/ray-project/ray/blob/master/doc/source/rllib-training.rst#python-api -ray.init(redirect_output=False, redirect_worker_output=False) +ray.init(log_to_driver=False) config = 
ppo.DEFAULT_CONFIG.copy() config["num_gpus"] = int(os.environ.get("SM_NUM_GPUS", 0)) checkpoint_dir = os.environ.get("SM_MODEL_DIR", '/Users/nadzeya/gym') From 3fc4e75a0034805182601300e1a368879be52bb9 Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 18 Apr 2019 16:14:54 -0700 Subject: [PATCH 2/8] Update readme with new ray version. --- README.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 1be3774..164a7a0 100644 --- a/README.rst +++ b/README.rst @@ -117,7 +117,7 @@ To build RL Docker image: # Example # Ray TensorFlow CPU - docker build -t tf-ray:0.5.3-cpu-py3 -f ray/docker/0.5.3/Dockerfile.tf --build-arg processor=cpu . + docker build -t tf-ray:0.6.5-cpu-py3 -f ray/docker/0.6.5/Dockerfile.tf --build-arg processor=cpu . # Coach TensorFlow GPU docker build -t tf-coach:0.11.0-gpu-py3 -f coach/docker/0.11.0/Dockerfile.tf --build-arg processor=gpu . @@ -239,10 +239,10 @@ TensorFlow Coach Images: TensorFlow Ray Images: -* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5-cpu-py3 -* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5.3-cpu-py3 -* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5-gpu-py3 -* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5.3-gpu-py3 +* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6-cpu-py3 +* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6.5-cpu-py3 +* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6-gpu-py3 +* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6.5-gpu-py3 `List of supported SageMaker regions `__. From 4a42f194b623c37dfe9b5ad651d6860289dc1402 Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 18 Apr 2019 16:15:13 -0700 Subject: [PATCH 3/8] Freeze opencv-version. 
--- ray/docker/0.6.5/Dockerfile.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray/docker/0.6.5/Dockerfile.tf b/ray/docker/0.6.5/Dockerfile.tf index 0b70044..cc9b21d 100644 --- a/ray/docker/0.6.5/Dockerfile.tf +++ b/ray/docker/0.6.5/Dockerfile.tf @@ -18,7 +18,7 @@ RUN pip install --no-cache-dir \ Cython==0.29.7 \ gym==0.12.1 \ lz4 \ - opencv-python-headless \ + opencv-python-headless==4.1.0 \ PyOpenGL==3.1.0 \ pyyaml \ redis==3.2.1 \ From 021b482c45aa4b591dc4299eaebde2380cabb823 Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 18 Apr 2019 16:19:30 -0700 Subject: [PATCH 4/8] Update buildspec to use newest ray version. --- buildspec.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index a929f1e..4589b22 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -6,8 +6,8 @@ env: COACH_MXNET_FRAMEWORK_VERSION: '1.3.0' COACH_TF_TOOLKIT_VERSION: '0.11.1' COACH_TF_FRAMEWORK_VERSION: '1.12.0' - RAY_TF_TOOKIT_VERSION: '0.5.3' - RAY_TF_FRAMEWORK_VERSION: '1.11.0' + RAY_TF_TOOKIT_VERSION: '0.6.5' + RAY_TF_FRAMEWORK_VERSION: '1.12.0' CPU_INSTANCE_TYPE: 'ml.c4.xlarge' GPU_INSTANCE_TYPE: 'ml.p2.xlarge' PY_VERSION: '3' From 4f88c585ba102d56c82c0bb146c0d42c97eec174 Mon Sep 17 00:00:00 2001 From: yangaws <31293788+yangaws@users.noreply.github.com> Date: Thu, 25 Apr 2019 08:03:20 -0700 Subject: [PATCH 5/8] Follow more recent redis installation instructions. 
(#20) * test * Update buildspec * Increase ssh connection to 50 minutes --- buildspec.yml | 94 +++++++++++++++------------- coach/docker/0.11.0/Dockerfile.mxnet | 3 +- 2 files changed, 50 insertions(+), 47 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index feb6bab..795eab1 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -26,15 +26,22 @@ phases: ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text) PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO" # keep ssh connection alive when communicating with remote ec2 server during integ test - # largest connection idle time allowed: 30 seconds * 60 attempts = 30 minutes + # largest connection idle time allowed: 10 seconds * 300 attempts = 50 minutes - | - echo ' ServerAliveInterval 30' >> ~/.ssh/config - echo ' ServerAliveCountMax 60' >> ~/.ssh/config + echo ' ServerAliveInterval 10' >> ~/.ssh/config + echo ' ServerAliveCountMax 300' >> ~/.ssh/config build: commands: # install - pip3 install -U -e . + # launch remote gpu instance + - | + prefix='ml.' + instance_type=${GPU_INSTANCE_TYPE#"$prefix"} + - create-key-pair + - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu + - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION --registry-ids $PROD_ACCOUNT) - | MXNET_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-mxnet" @@ -65,82 +72,79 @@ phases: # build cpu images - | - COACH_MXNET_CPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-cpu-py$PY_VERSION-$build_id" + COACH_MXNET_CPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-cpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=cpu . 
- COACH_TF_CPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$build_id" + COACH_TF_CPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$COACH_TF_CPU_TAG -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=cpu . - RAY_TF_CPU_TAG="ray-$RAY_TF_TOOKIT_VERSION-tf-cpu-py$PY_VERSION-$build_id" + RAY_TF_CPU_TAG="ray-$RAY_TF_TOOKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$RAY_TF_CPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=cpu . - + + # push cpu images to ecr + - | + $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) + docker push $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG + docker push $PREPROD_IMAGE:$COACH_TF_CPU_TAG + docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG + + # run cpu integration tests + - | + if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*"; then + pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --processor cpu + pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --processor cpu + pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --processor cpu + else + echo "skipping cpu integration tests" + fi + # build gpu images - | - COACH_MXNET_GPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-gpu-py$PY_VERSION-$build_id" + COACH_MXNET_GPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-gpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=gpu . 
- COACH_TF_GPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$build_id" + COACH_TF_GPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$COACH_TF_GPU_TAG -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=gpu . - RAY_TF_GPU_TAG="ray-$RAY_TF_TOOKIT_VERSION-tf-gpu-py$PY_VERSION-$build_id" + RAY_TF_GPU_TAG="ray-$RAY_TF_TOOKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=gpu . - - # push images to ecr + + # push gpu images to ecr - | $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - docker push $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG - docker push $PREPROD_IMAGE:$COACH_TF_CPU_TAG - docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG docker push $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG docker push $PREPROD_IMAGE:$COACH_TF_GPU_TAG docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG - # launch remote gpu instance - - | - prefix='ml.' 
- instance_type=${GPU_INSTANCE_TYPE#"$prefix"} - - create-key-pair - - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu - # run gpu integration tests - | if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*"; then printf "$SETUP_CMDS" > $SETUP_FILE - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_GPU_TAG --processor gpu" + cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework mxnet --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_GPU_TAG --processor gpu" remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_GPU_TAG --processor gpu" + cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_GPU_TAG --processor gpu" remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --skip-setup - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit ray --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_GPU_TAG --processor gpu" + cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit ray --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_GPU_TAG --processor gpu" remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --skip-setup else echo "skipping coach gpu integration tests" fi - # run cpu integration tests + # run cpu sagemaker tests - | - if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*"; then - pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_CPU_TAG --toolkit coach --processor cpu - pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_CPU_TAG --toolkit coach 
--processor cpu - pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_CPU_TAG --toolkit ray --processor cpu + if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*"; then + pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --instance-type $CPU_INSTANCE_TYPE + pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --instance-type $CPU_INSTANCE_TYPE + pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --instance-type $CPU_INSTANCE_TYPE else - echo "skipping cpu integration tests" + echo "skipping cpu sagemaker tests" fi # run gpu sagemaker tests - | if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*"; then - pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --toolkit coach --instance-type $GPU_INSTANCE_TYPE - pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_CPU_TAG --toolkit coach --instance-type $GPU_INSTANCE_TYPE - pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_CPU_TAG --toolkit ray --instance-type $GPU_INSTANCE_TYPE + pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_GPU_TAG --framework mxnet --toolkit coach --instance-type $GPU_INSTANCE_TYPE + pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_GPU_TAG --framework tensorflow --toolkit 
coach --instance-type $GPU_INSTANCE_TYPE + pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_GPU_TAG --framework tensorflow --toolkit ray --instance-type $GPU_INSTANCE_TYPE else echo "skipping gpu sagemaker tests" fi - - # run cpu sagemaker tests - - | - if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*"; then - pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --toolkit coach --instance-type $CPU_INSTANCE_TYPE - pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_CPU_TAG --toolkit coach --instance-type $CPU_INSTANCE_TYPE - pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_CPU_TAG --toolkit ray --instance-type $CPU_INSTANCE_TYPE - else - echo "skipping cpu sagemaker tests" - fi finally: # shut down remote gpu instance - cleanup-gpu-instances @@ -153,4 +157,4 @@ phases: aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$RAY_TF_CPU_TAG aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$COACH_MXNET_GPU_TAG aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$COACH_TF_GPU_TAG - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$RAY_TF_GPU_TAG + aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$RAY_TF_GPU_TAG \ No newline at end of file diff --git a/coach/docker/0.11.0/Dockerfile.mxnet b/coach/docker/0.11.0/Dockerfile.mxnet index dba06bd..c0adfeb 100644 --- a/coach/docker/0.11.0/Dockerfile.mxnet +++ b/coach/docker/0.11.0/Dockerfile.mxnet @@ -19,8 +19,7 @@ RUN cd /tmp && \ wget 
http://download.redis.io/redis-stable.tar.gz && \ tar xvzf redis-stable.tar.gz && \ cd redis-stable && \ - make && \ - make installtest + make # Install and set up coach RUN pip install --no-cache-dir \ From 4a145ba5a3bb562d431ad3b2ca000e841ab82511 Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 25 Apr 2019 12:01:57 -0700 Subject: [PATCH 6/8] Add echo for each build stage about what it does. --- buildspec.yml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index 795eab1..c0cf066 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -33,11 +33,12 @@ phases: build: commands: # install + - echo "install" - pip3 install -U -e . # launch remote gpu instance - - | - prefix='ml.' + - echo "launch remote gpu instance" + - prefix='ml.' instance_type=${GPU_INSTANCE_TYPE#"$prefix"} - create-key-pair - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu @@ -48,7 +49,8 @@ phases: TF_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-tensorflow-scriptmode" BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')" - # pull cpu images + # pull cpu base images + - echo "pull cpu base images" - | COACH_MXNET_CPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-cpu-py$PY_VERSION" docker pull $MXNET_IMAGE:$COACH_MXNET_CPU_BASE_TAG @@ -59,7 +61,8 @@ phases: docker pull $TF_IMAGE:$RAY_TF_CPU_BASE_TAG fi - # pull gpu images + # pull gpu base images + - echo "pull gpu base images" - | COACH_MXNET_GPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-gpu-py$PY_VERSION" docker pull $MXNET_IMAGE:$COACH_MXNET_GPU_BASE_TAG @@ -71,6 +74,7 @@ phases: fi # build cpu images + - echo "build cpu images" - | COACH_MXNET_CPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-cpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=cpu . 
@@ -80,6 +84,7 @@ phases: docker build -t $PREPROD_IMAGE:$RAY_TF_CPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=cpu . # push cpu images to ecr + - echo "push cpu images to ecr" - | $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) docker push $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG @@ -87,6 +92,7 @@ phases: docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG # run cpu integration tests + - echo "run cpu integration tests" - | if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*"; then pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --processor cpu @@ -97,6 +103,7 @@ phases: fi # build gpu images + - echo "build gpu images" - | COACH_MXNET_GPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-gpu-py$PY_VERSION-$BUILD_ID" docker build -t $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=gpu . @@ -106,6 +113,7 @@ phases: docker build -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=gpu . 
# push gpu images to ecr + - echo "push gpu images to ecr" - | $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) docker push $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG @@ -113,6 +121,7 @@ phases: docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG # run gpu integration tests + - echo "run gpu integration tests" - | if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*"; then printf "$SETUP_CMDS" > $SETUP_FILE @@ -127,6 +136,7 @@ phases: fi # run cpu sagemaker tests + - echo "run cpu sagemaker tests" - | if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*"; then pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --instance-type $CPU_INSTANCE_TYPE @@ -137,6 +147,7 @@ phases: fi # run gpu sagemaker tests + - echo "run gpu sagemaker tests" - | if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*"; then pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_GPU_TAG --framework mxnet --toolkit coach --instance-type $GPU_INSTANCE_TYPE From 4708ea245da38d518123b40e3f49c8da7289c160 Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 25 Apr 2019 12:08:38 -0700 Subject: [PATCH 7/8] Freeze opencv version. 
--- ray/docker/0.6.5/Dockerfile.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray/docker/0.6.5/Dockerfile.tf b/ray/docker/0.6.5/Dockerfile.tf index cc9b21d..b92c321 100644 --- a/ray/docker/0.6.5/Dockerfile.tf +++ b/ray/docker/0.6.5/Dockerfile.tf @@ -18,7 +18,7 @@ RUN pip install --no-cache-dir \ Cython==0.29.7 \ gym==0.12.1 \ lz4 \ - opencv-python-headless==4.1.0 \ + opencv-python-headless==4.1.0.25 \ PyOpenGL==3.1.0 \ pyyaml \ redis==3.2.1 \ From 2ecf063b0dfde728ee81a6b5eacd142e9fe8b03f Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Thu, 25 Apr 2019 15:24:07 -0700 Subject: [PATCH 8/8] Fix typo in build config. --- buildspec.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/buildspec.yml b/buildspec.yml index c0cf066..b588429 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -38,7 +38,8 @@ phases: # launch remote gpu instance - echo "launch remote gpu instance" - - prefix='ml.' + - | + prefix='ml.' instance_type=${GPU_INSTANCE_TYPE#"$prefix"} - create-key-pair - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu