Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ To build RL Docker image:
# Example

# Ray TensorFlow CPU
docker build -t tf-ray:0.5.3-cpu-py3 -f ray/docker/0.5.3/Dockerfile.tf --build-arg processor=cpu .
docker build -t tf-ray:0.6.5-cpu-py3 -f ray/docker/0.6.5/Dockerfile.tf --build-arg processor=cpu .

# Coach TensorFlow GPU
docker build -t tf-coach:0.11.0-gpu-py3 -f coach/docker/0.11.0/Dockerfile.tf --build-arg processor=gpu .
Expand Down Expand Up @@ -239,10 +239,10 @@ TensorFlow Coach Images:

TensorFlow Ray Images:

* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5-cpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5.3-cpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5-gpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.5.3-gpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6-cpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6.5-cpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6-gpu-py3
* 520713654638.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-tensorflow:ray0.6.5-gpu-py3


`List of supported SageMaker regions <https://docs.aws.amazon.com/general/latest/gr/rande.html#sagemaker_region>`__.
Expand Down
28 changes: 20 additions & 8 deletions buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ env:
COACH_MXNET_FRAMEWORK_VERSION: '1.3.0'
COACH_TF_TOOLKIT_VERSION: '0.11.1'
COACH_TF_FRAMEWORK_VERSION: '1.12.0'
RAY_TF_TOOKIT_VERSION: '0.5.3'
RAY_TF_FRAMEWORK_VERSION: '1.11.0'
RAY_TF_TOOKIT_VERSION: '0.6.5'
RAY_TF_FRAMEWORK_VERSION: '1.12.0'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
PY_VERSION: '3'
Expand All @@ -34,9 +34,11 @@ phases:
build:
commands:
# install
- echo "install"
- pip3 install -U -e .

# launch remote gpu instance
- echo "launch remote gpu instance"
- |
prefix='ml.'
instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
Expand All @@ -49,7 +51,8 @@ phases:
TF_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-tensorflow-scriptmode"
BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"

# pull cpu images
# pull cpu base images
- echo "pull cpu base images"
- |
COACH_MXNET_CPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
docker pull $MXNET_IMAGE:$COACH_MXNET_CPU_BASE_TAG
Expand All @@ -60,7 +63,8 @@ phases:
docker pull $TF_IMAGE:$RAY_TF_CPU_BASE_TAG
fi

# pull gpu images
# pull gpu base images
- echo "pull gpu base images"
- |
COACH_MXNET_GPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
docker pull $MXNET_IMAGE:$COACH_MXNET_GPU_BASE_TAG
Expand All @@ -72,6 +76,7 @@ phases:
fi

# build cpu images
- echo "build cpu images"
- |
COACH_MXNET_CPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-cpu-py$PY_VERSION-$BUILD_ID"
docker build -t $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=cpu .
Expand All @@ -81,15 +86,17 @@ phases:
docker build -t $PREPROD_IMAGE:$RAY_TF_CPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=cpu .

# push cpu images to ecr
- echo "push cpu images to ecr"
- |
$(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
docker push $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG
docker push $PREPROD_IMAGE:$COACH_TF_CPU_TAG
docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG

# run cpu integration tests
- echo "run cpu integration tests"
- |
if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/* buildspec.yml"; then
if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" "buildspec.yml"; then
pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --processor cpu
pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --processor cpu
pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --processor cpu
Expand All @@ -98,6 +105,7 @@ phases:
fi

# build gpu images
- echo "build gpu images"
- |
COACH_MXNET_GPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-gpu-py$PY_VERSION-$BUILD_ID"
docker build -t $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=gpu .
Expand All @@ -107,15 +115,17 @@ phases:
docker build -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=gpu .

# push gpu images to ecr
- echo "push gpu images to ecr"
- |
$(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
docker push $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG
docker push $PREPROD_IMAGE:$COACH_TF_GPU_TAG
docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG

# run gpu integration tests
- echo "run gpu integration tests"
- |
if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" buildspec.yml; then
if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" "buildspec.yml"; then
printf "$SETUP_CMDS" > $SETUP_FILE
cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework mxnet --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_GPU_TAG --processor gpu"
remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number $PR_NUM
Expand All @@ -128,8 +138,9 @@ phases:
fi

# run cpu sagemaker tests
- echo "run cpu sagemaker tests"
- |
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" buildspec.yml; then
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec.yml"; then
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --instance-type $CPU_INSTANCE_TYPE
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --instance-type $CPU_INSTANCE_TYPE
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --instance-type $CPU_INSTANCE_TYPE
Expand All @@ -138,8 +149,9 @@ phases:
fi

# run gpu sagemaker tests
- echo "run gpu sagemaker tests"
- |
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" buildspec.yml; then
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec.yml"; then
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_GPU_TAG --framework mxnet --toolkit coach --instance-type $GPU_INSTANCE_TYPE
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_GPU_TAG --framework tensorflow --toolkit coach --instance-type $GPU_INSTANCE_TYPE
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_GPU_TAG --framework tensorflow --toolkit ray --instance-type $GPU_INSTANCE_TYPE
Expand Down
40 changes: 40 additions & 0 deletions ray/docker/0.6.5/Dockerfile.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Ray 0.6.5 reinforcement-learning image, layered on the SageMaker TensorFlow
# 1.12.0 script-mode base image.
# `processor` is a build argument (pass it with --build-arg) that is substituted
# into the base image tag below; it is declared before FROM so FROM can use it.
ARG processor
FROM 520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-scriptmode:1.12.0-$processor-py3

# System packages installed with apt. The apt cache is cleaned and the package
# lists are deleted in the same RUN, so they are not kept in this layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
jq \
libav-tools \
libjpeg-dev \
libxrender1 \
python3.6-dev \
python3-opengl \
wget \
xvfb && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Python packages. ray and ray[rllib] are both pinned to 0.6.5; lz4, pyyaml and
# scipy are left unpinned. --no-cache-dir stops pip from storing its download
# cache in the image.
RUN pip install --no-cache-dir \
Cython==0.29.7 \
gym==0.12.1 \
lz4 \
opencv-python-headless==4.1.0.25 \
PyOpenGL==3.1.0 \
pyyaml \
redis==3.2.1 \
ray==0.6.5 \
ray[rllib]==0.6.5 \
scipy

# Force a UTF-8 locale; see the linked Click documentation on Python 3.
# https://click.palletsprojects.com/en/7.x/python3/
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

# Copy workaround script for incorrect hostname
# NOTE(review): only the C source is copied to the image root here; where it is
# compiled or loaded is not visible in this file (presumably start.sh) - confirm.
COPY lib/changehostname.c /

# Install the container start script and make it executable.
COPY lib/start.sh /usr/local/bin/start.sh
RUN chmod +x /usr/local/bin/start.sh

# Starts framework
# Exec-form entrypoint: bash is invoked with -m (monitor mode / job control) and
# start.sh as the script to run.
# NOTE(review): start.sh is given without a path, so this relies on bash finding
# it on PATH (it was copied to /usr/local/bin above) when it is not present in
# the working directory - confirm.
ENTRYPOINT ["bash", "-m", "start.sh"]
2 changes: 1 addition & 1 deletion test/resources/ray_cartpole/train_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ray.tune.logger import pretty_print

# Based on https://github.com/ray-project/ray/blob/master/doc/source/rllib-training.rst#python-api
ray.init(redirect_output=False, redirect_worker_output=False)
ray.init(log_to_driver=False)
config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = int(os.environ.get("SM_NUM_GPUS", 0))
checkpoint_dir = os.environ.get("SM_MODEL_DIR", '/Users/nadzeya/gym')
Expand Down