diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2ff7d61e..7f2d0ed0 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,5 +1,6 @@ -Issue #, if available: +**Issue #, if available:** + +**Description of changes:** -Description of changes: By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. diff --git a/.github/workflows/ci.yml b/.github/workflows/build-and-test.yaml old mode 100644 new mode 100755 similarity index 53% rename from .github/workflows/ci.yml rename to .github/workflows/build-and-test.yaml index 603463cc..a0d45064 --- a/.github/workflows/ci.yml +++ b/.github/workflows/build-and-test.yaml @@ -1,6 +1,16 @@ -name: NTH Continuous Integration and Release - -on: [push, pull_request, workflow_dispatch] +name: Build and Test + +on: + push: + branches: + - main + tags-ignore: + - "v*.*.*" + pull_request: + workflow_dispatch: + # Run M-F at 5AM CDT + schedule: + - cron: '0 10 * * 1-5' env: DEFAULT_GO_VERSION: ^1.16 @@ -9,11 +19,9 @@ env: WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }} jobs: - fastTests: - name: Fast Tests and Lints + name: Fast Test runs-on: ubuntu-20.04 - if: ${{ !contains(github.ref, 'refs/tags/') }} steps: - name: Set up Go 1.x uses: actions/setup-go@v2 @@ -23,6 +31,15 @@ jobs: - name: Check out code into the Go module directory uses: actions/checkout@v2 + - name: Restore go mod cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + ~/go/bin/ + key: gocache + - name: Unit Tests run: make unit-test @@ -34,16 +51,21 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Run golangci-lint - uses: golangci/golangci-lint-action@v2.5.2 - - name: Generate K8s YAML run: make generate-k8s-yaml + golangci: + # this action needs to run in its own job per setup + name: Lint Eastwood + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Set up golangci-lint + uses: golangci/golangci-lint-action@v2.5.2 + buildLinux: name: Build Linux Binaries runs-on: ubuntu-20.04 - if: ${{ !contains(github.ref, 'refs/tags/') }} steps: - name: Set up Go 1.x uses: actions/setup-go@v2 @@ -52,19 +74,37 @@ jobs: - name: Check out code into the Go module directory uses: actions/checkout@v2 + + - name: Restore go mod cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + ~/go/bin/ + key: gocache + - name: Build Linux Binaries run: make build-binaries buildLinuxDocker: name: Build Linux Docker Images runs-on: ubuntu-20.04 - if: ${{ !contains(github.ref, 'refs/tags/') }} steps: - name: Set up Go 1.x uses: actions/setup-go@v2 with: go-version: ${{ env.DEFAULT_GO_VERSION }} + - name: Restore go mod cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + ~/go/bin/ + key: gocache + - name: Check out code into the Go module directory uses: actions/checkout@v2 @@ -74,7 +114,6 @@ jobs: buildWindows: name: Build Windows Binaries runs-on: windows-2019 - if: ${{ !contains(github.ref, 'refs/tags/') }} steps: - name: Set up Go 1.x uses: actions/setup-go@v2 @@ -84,13 +123,21 @@ jobs: - name: Check out code into the Go module directory uses: actions/checkout@v2 + - name: Restore go mod cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + ~/go/bin/ + key: gocache + - name: Build Windows Binaries run: choco install make && choco install zip && RefreshEnv.cmd && make build-binaries-windows buildWindowsDocker: name: Build Windows Docker Images 
runs-on: windows-2019 - if: ${{ !contains(github.ref, 'refs/tags/') }} steps: - name: Set up Go 1.x uses: actions/setup-go@v2 @@ -100,13 +147,21 @@ jobs: - name: Check out code into the Go module directory uses: actions/checkout@v2 + - name: Restore go mod cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + ~/go/bin/ + key: gocache + - name: Build Windows Docker Images run: choco install make && RefreshEnv.cmd && make build-docker-images-windows e2e: name: E2E Tests runs-on: ubuntu-20.04 - if: ${{ !contains(github.ref, 'refs/tags/') }} strategy: matrix: k8sVersion: ["1.17", "1.18", "1.19", "1.20", "1.21", "1.22"] @@ -119,72 +174,14 @@ jobs: - name: Check out code into the Go module directory uses: actions/checkout@v2 - - name: E2E Tests - run: test/k8s-local-cluster-test/run-test -v ${{ matrix.k8sVersion }} - - releaseLinux: - name: Release Linux - runs-on: ubuntu-20.04 - if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') - steps: - - name: Set up Go 1.x - uses: actions/setup-go@v2 - with: - go-version: ${{ env.DEFAULT_GO_VERSION }} - - - name: Check out code into the Go module directory - uses: actions/checkout@v2 - - - name: Release Linux Assets - run: make release - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} - - releaseWindows: - name: Release Windows - runs-on: windows-2019 - if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') - steps: - - name: Set up Go 1.x - uses: actions/setup-go@v2 + - name: Restore go mod cache + uses: actions/cache@v2 with: - go-version: ${{ env.DEFAULT_GO_VERSION }} + path: | + ~/.cache/go-build + ~/go/pkg/mod + ~/go/bin/ + key: gocache - - name: Check out code into the Go module directory - uses: actions/checkout@v2 - - - name: Release Windows Assets - run: choco install make && choco install zip && RefreshEnv.cmd && make release-windows - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} - - release: - name: Release - runs-on: ubuntu-20.04 - needs: [releaseLinux, releaseWindows] - if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') - steps: - - name: Set up Go 1.x - uses: actions/setup-go@v2 - with: - go-version: ${{ env.DEFAULT_GO_VERSION }} - - - name: Check out code into the Go module directory - uses: actions/checkout@v2 - - - name: Create eks-charts PR - run: make ekscharts-sync-release - - - name: Sync Readme to ECR Public - run: make sync-readme-to-ecr-public - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} - - - name: Create NTH README Update PR - run: make create-release-prep-pr-readme + - name: E2E Tests + run: test/k8s-local-cluster-test/run-test -v ${{ matrix.k8sVersion }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100755 index 00000000..0c0a0a54 --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,80 @@ +name: Release + +on: + push: + tags: + - "v*.*.*" + +permissions: + contents: write # required for uploading releases + +env: + DEFAULT_GO_VERSION: ^1.16 + GITHUB_USERNAME: ${{ secrets.EC2_BOT_GITHUB_USERNAME }} + GITHUB_TOKEN: ${{ secrets.EC2_BOT_GITHUB_TOKEN }} + WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }} + +jobs: + 
releaseLinux: + name: Release Linux + runs-on: ubuntu-20.04 + steps: + - name: Set up Go 1.x + uses: actions/setup-go@v2 + with: + go-version: ${{ env.DEFAULT_GO_VERSION }} + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Release Linux Assets + run: make release + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} + + releaseWindows: + name: Release Windows + runs-on: windows-2019 + steps: + - name: Set up Go 1.x + uses: actions/setup-go@v2 + with: + go-version: ${{ env.DEFAULT_GO_VERSION }} + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Release Windows Assets + run: choco install make && choco install zip && RefreshEnv.cmd && make release-windows + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} + + release: + name: Release + runs-on: ubuntu-20.04 + needs: [releaseLinux, releaseWindows] + steps: + - name: Set up Go 1.x + uses: actions/setup-go@v2 + with: + go-version: ${{ env.DEFAULT_GO_VERSION }} + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Create eks-charts PR + run: make ekscharts-sync-release + + - name: Sync Readme to ECR Public + run: make sync-readme-to-ecr-public + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} + + - name: Create NTH README Update PR + run: make create-release-prep-pr-readme \ No newline at end of file diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 80f442d9..a5b4c02e 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -21,6 +21,7 @@ jobs: days-before-issue-close: -1 remove-stale-when-updated: true stale-pr-label: "stale" + operations-per-run: 100 stale-pr-message: > This PR has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. If you want this PR to never become stale, please ask a maintainer to apply the "stalebot-ignore" label. @@ -40,6 +41,7 @@ jobs: days-before-pr-close: -1 remove-stale-when-updated: true stale-issue-label: "stale" + operations-per-run: 100 stale-issue-message: > This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. If you want this issue to never become stale, please ask a maintainer to apply the "stalebot-ignore" label. diff --git a/BUILD.md b/BUILD.md index 2fb3a119..6912a496 100644 --- a/BUILD.md +++ b/BUILD.md @@ -5,21 +5,26 @@ Clone the repo: ``` git clone https://github.com/aws/aws-node-termination-handler.git ``` -Build the latest version of the docker image: +Build the latest version of the docker image for `linux/amd64`: ``` make docker-build ``` ### Multi-Target -By default a linux/amd64 image will be built. To build for a different target the build-arg `GOARCH` can be changed. 
+If you instead want to build for all support Linux architectures (`linux/amd64` and `linux/arm64`), you can run this make target: +``` +make build-docker-images +``` +Under the hood, this passes each architecture as the `--platform` argument to `docker buildx build`, like this: ``` -$ docker build --build-arg=GOARCH=amd64 -t ${USER}/aws-node-termination-handler-amd64:v1.0.0 . -$ docker build --build-arg=GOARCH=arm64 -t ${USER}/aws-node-termination-handler-arm64:v1.0.0 . +$ docker buildx create --use +$ docker buildx build --load --platform "linux/amd64" -t ${USER}/aws-node-termination-handler-amd64:v1.0.0 . +$ docker buildx build --load --platform "linux/arm64" -t ${USER}/aws-node-termination-handler-arm64:v1.0.0 . ``` -To push a multi-arch image, the helper tool [manifest-tool](https://github.com/estesp/manifest-tool) can be used. +To push a multi-arch image, you can use the helper tool [manifest-tool](https://github.com/estesp/manifest-tool). ``` $ cat << EOF > manifest.yaml @@ -39,16 +44,24 @@ EOF $ manifest-tool push from-spec manifest.yaml ``` +### Building for Windows + +You can build the Windows docker image with the following command: +``` +make build-docker-images-windows +``` +Currently, our `windows/amd64` builds use the older `docker build` system, not `docker buildx build` because it does not seem to be well supported. We hope to unify them in the future. + ### Go Module Proxy -By default, Go 1.13+ uses the proxy.golang.org proxy for go module downloads. This can be changed to a different go module proxy or revert back to pre-go 1.13 default which was "direct". `GOPROXY=direct` will pull from the VCS provider directly instead of going through a proxy at all. +By default, Go 1.13+ uses the proxy.golang.org proxy for go module downloads. You can change this to a different go module proxy or revert back to pre-go 1.13 default which was "direct". `GOPROXY=direct` will pull from the VCS provider directly instead of going through a proxy at all. ``` ## No Proxy -docker build --build-arg=GOPROXY=direct -t ${USER}/aws-node-termination-handler:v1.0.0 . +docker buildx build --load --build-arg=GOPROXY=direct -t ${USER}/aws-node-termination-handler:v1.0.0 . ## My Corp Proxy -docker build --build-arg=GOPROXY=go-proxy.mycorp.com -t ${USER}/aws-node-termination-handler:v1.0.0 . +docker buildx build --load --build-arg=GOPROXY=go-proxy.mycorp.com -t ${USER}/aws-node-termination-handler:v1.0.0 . ``` ### Kubernetes Object Files diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..a6990f6e --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,4 @@ +# Require approvals from someone in the owner team before merging +# More information here: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners + +* @aws/ec2-guacamole diff --git a/Dockerfile b/Dockerfile index 8d6a3e44..8b13b6d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.16 as builder +FROM --platform=$BUILDPLATFORM golang:1.16 as builder ## GOLANG env ARG GOPROXY="https://proxy.golang.org|direct" @@ -11,8 +11,9 @@ COPY go.sum . RUN go mod download ARG CGO_ENABLED=0 -ARG GOOS=linux -ARG GOARCH=amd64 +ARG TARGETOS TARGETARCH +ARG GOOS=$TARGETOS +ARG GOARCH=$TARGETARCH # Build COPY . . 
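The Dockerfile change above is what lets a single builder stage cross-compile for every `--platform` value: `docker buildx build` runs the builder on the host platform (`$BUILDPLATFORM`) and injects `TARGETOS`/`TARGETARCH`, which the Dockerfile forwards to `GOOS`/`GOARCH`. A minimal sketch of the resulting flow, reusing the buildx commands already shown in BUILD.md (the `docker image inspect` step is only an illustrative way to confirm the cross-build):

```
# Each buildx invocation cross-compiles on the host; TARGETOS/TARGETARCH are set
# per --platform and become GOOS/GOARCH inside the builder stage.
$ docker buildx create --use
$ docker buildx build --load --platform "linux/arm64" -t ${USER}/aws-node-termination-handler-arm64:v1.0.0 .

# Confirm the image was built for the requested architecture.
$ docker image inspect --format '{{.Architecture}}' ${USER}/aws-node-termination-handler-arm64:v1.0.0
```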
diff --git a/Makefile b/Makefile index fb146e49..738c9cf9 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ GOARCH ?= amd64 GOPROXY ?= "https://proxy.golang.org,direct" MAKEFILE_PATH = $(dir $(realpath -s $(firstword $(MAKEFILE_LIST)))) BUILD_DIR_PATH = ${MAKEFILE_PATH}/build -SUPPORTED_PLATFORMS_LINUX ?= "linux/amd64,linux/arm64,linux/arm,darwin/amd64" +SUPPORTED_PLATFORMS_LINUX ?= "linux/amd64,linux/arm64" SUPPORTED_PLATFORMS_WINDOWS ?= "windows/amd64" BINARY_NAME ?= "node-termination-handler" @@ -146,9 +146,6 @@ help: @grep -E '^[a-zA-Z_-]+:.*$$' $(MAKEFILE_LIST) | sort ## Targets intended to be run in preparation for a new release -draft-release-notes: - ${MAKEFILE_PATH}/scripts/draft-release-notes - create-local-release-tag-major: ${MAKEFILE_PATH}/scripts/create-local-tag-for-release -m diff --git a/README.md b/README.md index 1608100e..3abbce70 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,21 @@ docker-pulls + + +

-![NTH Continuous Integration and Release](https://github.com/aws/aws-node-termination-handler/workflows/NTH%20Continuous%20Integration%20and%20Release/badge.svg) -

+### Community Meeting +NTH community meeting is hosted on a monthly cadence. Everyone is welcome to participate! +* **When:** first Tuesday of every month from 9:00-9:30AM PST | [Calendar Event (ics)](https://raw.githubusercontent.com/aws/aws-node-termination-handler/main/assets/nth-community-meeting.ics) +* **Where:** [Chime meeting bridge](https://chime.aws/6502066216) + + ## Project Summary This project ensures that the Kubernetes control plane responds appropriately to events that can cause your EC2 instance to become unavailable, such as [EC2 maintenance events](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/monitoring-instances-status-check_sched.html), [EC2 Spot interruptions](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html), [ASG Scale-In](https://docs.aws.amazon.com/autoscaling/ec2/userguide/AutoScalingGroupLifecycle.html#as-lifecycle-scale-in), [ASG AZ Rebalance](https://docs.aws.amazon.com/autoscaling/ec2/userguide/auto-scaling-benefits.html#AutoScalingBehavior.InstanceUsage), and EC2 Instance Termination via the API or Console. If not handled, your application code may not stop gracefully, take longer to recover full availability, or accidentally schedule work to nodes that are going down. @@ -54,6 +61,7 @@ You can run the termination handler on any Kubernetes cluster running on AWS, in - EC2 Instance Rebalance Recommendation - EC2 Auto-Scaling Group Termination Lifecycle Hooks to take care of ASG Scale-In, AZ-Rebalance, Unhealthy Instances, and more! - EC2 Status Change Events + - EC2 Scheduled Change events from AWS Health - Helm installation and event configuration support - Webhook feature to send shutdown or restart notification messages - Unit & Integration Tests @@ -103,7 +111,7 @@ The termination handler DaemonSet installs into your cluster a [ServiceAccount]( You can use kubectl to directly add all of the above resources with the default configuration into your cluster. ``` -kubectl apply -f https://github.com/aws/aws-node-termination-handler/releases/download/v1.13.4/all-resources.yaml +kubectl apply -f https://github.com/aws/aws-node-termination-handler/releases/download/v1.16.0/all-resources.yaml ``` For a full list of releases and associated artifacts see our [releases page](https://github.com/aws/aws-node-termination-handler/releases). @@ -187,45 +195,12 @@ For a full list of configuration options see our [Helm readme](https://github.co The termination handler deployment requires some infrastructure to be setup before deploying the application. You'll need the following AWS infrastructure components: -1. AutoScaling Group Termination Lifecycle Hook -2. Amazon Simple Queue Service (SQS) Queue +1. Amazon Simple Queue Service (SQS) Queue +2. AutoScaling Group Termination Lifecycle Hook 3. Amazon EventBridge Rule 4. IAM Role for the aws-node-termination-handler Queue Processing Pods -#### 1. Setup a Termination Lifecycle Hook on an ASG: - -Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: - -``` -$ aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-term-hook \ - --auto-scaling-group-name=my-k8s-asg \ - --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ - --default-result=CONTINUE \ - --heartbeat-timeout=300 -``` - -#### 2. 
Tag the ASGs: - -By default the aws-node-termination-handler will only manage terminations for ASGs tagged w/ `key=aws-node-termination-handler/managed` - -``` -$ aws autoscaling create-or-update-tags \ - --tags ResourceId=my-auto-scaling-group,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true -``` - -The value of the key does not matter. - -This functionality is helpful in accounts where there are ASGs that do not run kubernetes nodes or you do not want aws-node-termination-handler to manage their termination lifecycle. -However, if your account is dedicated to ASGs for your kubernetes cluster, then you can turn off the ASG tag check by setting the flag `--check-asg-tag-before-draining=false` or environment variable `CHECK_ASG_TAG_BEFORE_DRAINING=false`. - -You can also control what resources NTH manages by adding the resource ARNs to your Amazon EventBridge rules. - -Take a look at the docs on how to create rules that only manage certain ASGs [here](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html). - -See all the different events docs [here](https://docs.aws.amazon.com/eventbridge/latest/userguide/event-types.html#auto-scaling-event-types). - -#### 3. Create an SQS Queue: +#### 1. Create an SQS Queue: Here is the AWS CLI command to create an SQS queue to hold termination events from ASG and EC2, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: @@ -263,9 +238,65 @@ EOF $ aws sqs create-queue --queue-name "${SQS_QUEUE_NAME}" --attributes file:///tmp/queue-attributes.json ``` +If you are sending Lifecycle termination events from ASG directly to SQS, instead of through EventBridge, then you will also need to create an IAM service role to give Amazon EC2 Auto Scaling access to your SQS queue. Please follow [these linked instructions to create the IAM service role: link.](https://docs.aws.amazon.com/autoscaling/ec2/userguide/configuring-lifecycle-hook-notifications.html#sqs-notifications) +Note the ARNs for the SQS queue and the associated IAM role for Step 2. + +There are some caveats when using [server side encryption with SQS](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-server-side-encryption.html): +* using [SSE-KMS](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-configure-sse-existing-queue.html) with a [customer managed key](https://docs.aws.amazon.com/kms/latest/developerguide/concepts.html#key-mgmt) requires [changing the KMS key policy](https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-troubleshooting.html#eb-sqs-encrypted) to allow EventBridge to publish events to SQS. +* using [SSE-KMS](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-configure-sse-existing-queue.html) with an [AWS managed key](https://docs.aws.amazon.com/kms/latest/developerguide/concepts.html#key-mgmt) is not supported as the KMS key policy can't be updated to allow EventBridge to publish events to SQS. +* using [SSE-SQS](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-configure-sqs-sse-queue.html) doesn't require extra setup and works out of the box as SQS queues without encryption at rest. + +#### 2. 
Setup a Termination Lifecycle Hook on an ASG: + +Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: + +``` +$ aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ + --default-result=CONTINUE \ + --heartbeat-timeout=300 +``` + +If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: + +``` +$ aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ + --default-result=CONTINUE \ + --heartbeat-timeout=300 \ + --notification-target-arn \ + --role-arn +``` + +#### 3. Tag the ASGs: + +By default the aws-node-termination-handler will only manage terminations for ASGs tagged w/ `key=aws-node-termination-handler/managed` + +``` +$ aws autoscaling create-or-update-tags \ + --tags ResourceId=my-auto-scaling-group,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true +``` + +The value of the key does not matter. + +This functionality is helpful in accounts where there are ASGs that do not run kubernetes nodes or you do not want aws-node-termination-handler to manage their termination lifecycle. +However, if your account is dedicated to ASGs for your kubernetes cluster, then you can turn off the ASG tag check by setting the flag `--check-asg-tag-before-draining=false` or environment variable `CHECK_ASG_TAG_BEFORE_DRAINING=false`. + +You can also control what resources NTH manages by adding the resource ARNs to your Amazon EventBridge rules. + +Take a look at the docs on how to create rules that only manage certain ASGs [here](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html). + +See all the different events docs [here](https://docs.aws.amazon.com/eventbridge/latest/userguide/event-types.html#auto-scaling-event-types). + #### 4. Create Amazon EventBridge Rules -Here are AWS CLI commands to create Amazon EventBridge rules so that ASG termination events, Spot Interruptions, Instance state changes and Rebalance Recommendations are sent to the SQS queue created in the previous step. This should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: +You may skip this step if sending events from ASG to SQS directly. + +Here are AWS CLI commands to create Amazon EventBridge rules so that ASG termination events, Spot Interruptions, Instance state changes, Rebalance Recommendations, and AWS Health Scheduled Changes are sent to the SQS queue created in the previous step. 
This should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: ``` $ aws events put-rule \ @@ -295,6 +326,13 @@ $ aws events put-rule \ $ aws events put-targets --rule MyK8sInstanceStateChangeRule \ --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue" + +$ aws events put-rule \ + --name MyK8sScheduledChangeRule \ + --event-pattern "{\"source\": [\"aws.health\"],\"detail-type\": [\"AWS Health Event\"]}" + +$ aws events put-targets --rule MyK8sScheduledChangeRule \ + --targets "Id"="1","Arn"="arn:aws:sqs:us-east-1:123456789012:MyK8sTermQueue" ``` #### 5. Create an IAM Role for the Pods @@ -388,7 +426,7 @@ Queue Processor needs an **sqs queue url** to function; therefore, manifest chan Minimal Config: ``` -curl -L https://github.com/aws/aws-node-termination-handler/releases/download/v1.13.4/all-resources-queue-processor.yaml -o all-resources-queue-processor.yaml +curl -L https://github.com/aws/aws-node-termination-handler/releases/download/v1.16.0/all-resources-queue-processor.yaml -o all-resources-queue-processor.yaml kubectl apply -f ./all-resources-queue-processor.yaml ``` diff --git a/assets/nth-community-meeting.ics b/assets/nth-community-meeting.ics new file mode 100644 index 00000000..e33306b2 --- /dev/null +++ b/assets/nth-community-meeting.ics @@ -0,0 +1,49 @@ +BEGIN:VCALENDAR +CALSCALE:GREGORIAN +VERSION:2.0 +X-WR-CALNAME:AWS Node Termination Handler Community Meeting +METHOD:PUBLISH +PRODID:-//Apple Inc.//macOS 11.6.2//EN +BEGIN:VTIMEZONE +TZID:America/Chicago +BEGIN:DAYLIGHT +TZOFFSETFROM:-0600 +RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU +DTSTART:20070311T020000 +TZNAME:CDT +TZOFFSETTO:-0500 +END:DAYLIGHT +BEGIN:STANDARD +TZOFFSETFROM:-0500 +RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU +DTSTART:20071104T020000 +TZNAME:CST +TZOFFSETTO:-0600 +END:STANDARD +END:VTIMEZONE +BEGIN:VEVENT +TRANSP:OPAQUE +DTEND;TZID=America/Chicago:20220104T113000 +LAST-MODIFIED:20220104T153156Z +UID:4693991B-94C2-4E48-A18C-C8DF1082D77C +DTSTAMP:20220104T152914Z +DESCRIPTION:==============Conference Bridge Information==============\nY + ou have been invited to an online meeting\, powered by Amazon Chime.\n\n + Chime meeting ID: 6502066216\n\nJoin via Chime clients (manually): Selec + t "Meetings > Join a Meeting"\, and enter 6502066216\n\nJoin via Chime c + lients (auto-call): If you invite auto-call as attendee\, Chime will cal + l you when the meeting starts\, select "Answer"\n\nJoin via browser scre + en share: https://chime.aws/6502066216\n\nJoin via phone (US): +1-929-43 + 2-4463\,\,\,6502066216#\n\nJoin vi + a phone (US toll-free): +1-855-552-4463\,\,\,6502066216#\n\nInternational dial-in: https://chime.aws/diali + nnumbers/\n\nIn-room video system: Ext: 62000\, Meeting PIN: 6502066216# + \n\n================================================= +SEQUENCE:0 +X-APPLE-TRAVEL-ADVISORY-BEHAVIOR:AUTOMATIC +DTSTART;TZID=America/Chicago:20220104T110000 +SUMMARY:AWS Node Termination Handler Community Meeting +CREATED:20220104T152821Z +RRULE:FREQ=MONTHLY;INTERVAL=1;BYDAY=1TU +END:VEVENT +END:VCALENDAR diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 160116a5..9c60a7ca 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -378,7 +378,6 @@ func cordonNode(node node.Node, nodeName string, drainEvent *monitor.Interruptio } else { log.Err(err).Msg("There was a problem while trying to cordon the node") recorder.Emit(nodeName, observability.Warning, observability.CordonErrReason, 
observability.CordonErrMsgFmt, err.Error()) - os.Exit(1) } return err } else { @@ -398,9 +397,6 @@ func cordonAndDrainNode(node node.Node, nodeName string, drainEvent *monitor.Int log.Err(err).Msg("There was a problem while trying to cordon and drain the node") metrics.NodeActionsInc("cordon-and-drain", nodeName, err) recorder.Emit(nodeName, observability.Warning, observability.CordonAndDrainErrReason, observability.CordonAndDrainErrMsgFmt, err.Error()) - if !sqsTerminationDraining { - os.Exit(1) - } } return err } else { diff --git a/config/helm/aws-node-termination-handler/.helmignore b/config/helm/aws-node-termination-handler/.helmignore index 50af0317..69a52314 100644 --- a/config/helm/aws-node-termination-handler/.helmignore +++ b/config/helm/aws-node-termination-handler/.helmignore @@ -20,3 +20,4 @@ .idea/ *.tmproj .vscode/ +example-values*.yaml diff --git a/config/helm/aws-node-termination-handler/Chart.yaml b/config/helm/aws-node-termination-handler/Chart.yaml index dae8179a..8f59d4ff 100644 --- a/config/helm/aws-node-termination-handler/Chart.yaml +++ b/config/helm/aws-node-termination-handler/Chart.yaml @@ -1,21 +1,25 @@ -apiVersion: v1 +apiVersion: v2 name: aws-node-termination-handler -description: A Helm chart for the AWS Node Termination Handler -version: 0.15.4 -appVersion: 1.13.4 +description: A Helm chart for the AWS Node Termination Handler. +type: application +version: 0.18.0 +appVersion: 1.16.0 +kubeVersion: ">= 1.16-0" +keywords: + - aws + - eks + - ec2 + - node-termination + - spot home: https://github.com/aws/eks-charts icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png sources: - - https://github.com/aws/eks-charts + - https://github.com/aws/aws-node-termination-handler/ + - https://github.com/aws/eks-charts/ maintainers: - name: Brandon Wagner url: https://github.com/bwagner5 email: bwagner5@users.noreply.github.com - - name: Jillian Montalvo + - name: Jillian Kuentz url: https://github.com/jillmon email: jillmon@users.noreply.github.com -keywords: - - eks - - ec2 - - node-termination - - spot diff --git a/config/helm/aws-node-termination-handler/README.md b/config/helm/aws-node-termination-handler/README.md index eaf49187..43f7d488 100644 --- a/config/helm/aws-node-termination-handler/README.md +++ b/config/helm/aws-node-termination-handler/README.md @@ -1,167 +1,172 @@ # AWS Node Termination Handler -AWS Node Termination Handler Helm chart for Kubernetes. For more information on this project see the project repo at https://github.com/aws/aws-node-termination-handler. +AWS Node Termination Handler Helm chart for Kubernetes. For more information on this project see the project repo at [github.com/aws/aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler). ## Prerequisites -* Kubernetes >= 1.14 +- _Kubernetes_ >= v1.16 ## Installing the Chart -Add the EKS repository to Helm: +Before you can install the chart you will need to add the `aws` repo to [Helm](https://helm.sh/). -```sh -helm repo add eks https://aws.github.io/eks-charts +```shell +helm repo add eks https://aws.github.io/eks-charts/ ``` -Install AWS Node Termination Handler: +After you've installed the repo you can install the chart, the following command will install the chart with the release name `aws-node-termination-handler` and the default configuration to the `kube-system` namespace. 
-To install the chart with the release name aws-node-termination-handler and default configuration: - -```sh -helm upgrade --install aws-node-termination-handler \ - --namespace kube-system \ - eks/aws-node-termination-handler +```shell +helm upgrade --install --namespace kube-system aws-node-termination-handler eks/aws-node-termination-handler ``` -To install into an EKS cluster where the Node Termination Handler is already installed, you can run: +To install the chart on an EKS cluster where the AWS Node Termination Handler is already installed, you can run the following command. -```sh -helm upgrade --install --recreate-pods --force \ - aws-node-termination-handler --namespace kube-system eks/aws-node-termination-handler +```shell +helm upgrade --install --namespace kube-system aws-node-termination-handler eks/aws-node-termination-handler --recreate-pods --force ``` -If you receive an error similar to `Error: release aws-node-termination-handler -failed: "aws-node-termination-handler" already exists`, simply rerun -the above command. - -The [configuration](#configuration) section lists the parameters that can be configured during installation. +If you receive an error similar to the one below simply rerun the above command. -## Uninstalling the Chart +> Error: release aws-node-termination-handler failed: "aws-node-termination-handler" already exists -To uninstall/delete the `aws-node-termination-handler` deployment: +To uninstall the `aws-node-termination-handler` chart installation from the `kube-system` namespace run the following command. -```sh -helm delete --purge aws-node-termination-handler +```shell +helm delete --namespace kube-system aws-node-termination-handler ``` -The command removes all the Kubernetes components associated with the chart and deletes the release. - ## Configuration -The following tables lists the configurable parameters of the chart and their default values. - -### AWS Node Termination Handler Common Configuration - -The configuration in this table applies to both queue-processor mode and IMDS mode. - -Parameter | Description | Default ---- | --- | --- -`deleteLocalData` | Tells kubectl to continue even if there are pods using emptyDir (local data that will be deleted when the node is drained). | `true` -`gracePeriod` | (DEPRECATED: Renamed to podTerminationGracePeriod) The time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used, which defaults to 30 seconds if not specified. | `-1` -`podTerminationGracePeriod` | The time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used, which defaults to 30 seconds if not specified. | `-1` -`nodeTerminationGracePeriod` | Period of time in seconds given to each NODE to terminate gracefully. Node draining will be scheduled based on this value to optimize the amount of compute time, but still safely drain the node before an event. | `120` -`ignoreDaemonSets` | Causes kubectl to skip daemon set managed pods | `true` -`instanceMetadataURL` | The URL of EC2 instance metadata. This shouldn't need to be changed unless you are testing. | `http://169.254.169.254:80` -`webhookURL` | Posts event data to URL upon instance interruption action | `` -`webhookURLSecretName` | Pass Webhook URL as a secret. Secret Key: `webhookurl`, Value: `` | None -`webhookProxy` | Uses the specified HTTP(S) proxy for sending webhooks | `` -`webhookHeaders` | Replaces the default webhook headers. 
| `{"Content-type":"application/json"}` -`webhookTemplate` | Replaces the default webhook message template. | `{"text":"[NTH][Instance Interruption] EventID: {{ .EventID }} - Kind: {{ .Kind }} - Instance: {{ .InstanceID }} - Node: {{ .NodeName }} - Description: {{ .Description }} - Start Time: {{ .StartTime }}"}` -`webhookTemplateConfigMapName` | Pass Webhook template file as configmap | None -`webhookTemplateConfigMapKey` | Name of the template file stored in the configmap| None -`metadataTries` | The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3. | `3` -`cordonOnly` | If true, nodes will be cordoned but not drained when an interruption event occurs. | `false` -`taintNode` | If true, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance`, `aws-node-termination-handler/spot-itn`, `aws-node-termination-handler/asg-lifecycle-termination` and `aws-node-termination-handler/rebalance-recommendation`| `false` -`jsonLogging` | If true, use JSON-formatted logs instead of human readable logs. | `false` -`logLevel` | Sets the log level (INFO, DEBUG, or ERROR) | `INFO` -`enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false` -`prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092` -`enableProbesServer` | If true, start an http server exposing `/healthz` endpoint for probes. | `false` -`probesServerPort` | Replaces the default HTTP port for exposing probes endpoint. | `8080` -`probesServerEndpoint` | Replaces the default endpoint for exposing probes endpoint. | `/healthz` -`podMonitor.create` | If `true`, create a PodMonitor | `false` -`podMonitor.interval` | Prometheus scrape interval | `30s` -`podMonitor.sampleLimit` | Number of scraped samples accepted | `5000` -`podMonitor.labels` | Additional PodMonitor metadata labels | `{}` -`podMonitor.namespace` | Override podMonitor Helm release namespace | `{{ .Release.Namespace }}` -`emitKubernetesEvents` | If `true`, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. In IMDS Processor mode a default set of annotations with all the node metadata gathered from IMDS will be attached to each event. More information [here](https://github.com/aws/aws-node-termination-handler/blob/main/docs/kubernetes_events.md) | `false` -`kubernetesExtraEventsAnnotations` | A comma-separated list of `key=value` extra annotations to attach to all emitted Kubernetes events. Example: `first=annotation,sample.annotation/number=two"` | None - -### AWS Node Termination Handler - Queue-Processor Mode Configuration - -Parameter | Description | Default ---- | --- | --- -`enableSqsTerminationDraining` | If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received. 
| `false` -`queueURL` | Listens for messages on the specified SQS queue URL | None -`awsRegion` | If specified, use the AWS region for AWS API calls, else NTH will try to find the region through AWS_REGION env var, IMDS, or the specified queue URL | `` -`checkASGTagBeforeDraining` | If true, check that the instance is tagged with "aws-node-termination-handler/managed" as the key before draining the node | `true` -`managedAsgTag` | The tag to ensure is on a node if checkASGTagBeforeDraining is true | `aws-node-termination-handler/managed` -`workers` | The maximum amount of parallel event processors | `10` -`replicas` | The number of replicas in the NTH deployment when using queue-processor mode (NOTE: increasing replicas may cause duplicate webhooks since NTH pods are stateless) | `1` -`podDisruptionBudget` | Limit the disruption for controller pods, requires at least 2 controller replicas | `{}` - -### AWS Node Termination Handler - IMDS Mode Configuration - -Parameter | Description | Default ---- | --- | --- -`enableScheduledEventDraining` | [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event | `false` -`enableSpotInterruptionDraining` | If true, drain nodes when the spot interruption termination notice is received | `true` -`enableRebalanceDraining` | If true, drain nodes when the rebalance recommendation notice is received | `false` -`enableRebalanceMonitoring` | If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set `enableRebalanceDraining`. | `false` -`useHostNetwork` | If `true`, enables `hostNetwork` for the Linux DaemonSet. NOTE: setting this to `false` may cause issues accessing IMDSv2 if your account is not configured with an IP hop count of 2 | `true` - -### Kubernetes Configuration - -Parameter | Description | Default ---- | --- | --- -`image.repository` | image repository | `public.ecr.aws/aws-ec2/aws-node-termination-handler` -`image.tag` | image tag | `` -`image.pullPolicy` | image pull policy | `IfNotPresent` -`image.pullSecrets` | image pull secrets (for private docker registries) | `[]` -`affinity` | node/pod affinities | None -`linuxAffinity` | Linux node/pod affinities | None -`windowsAffinity` | Windows node/pod affinities | None -`podAnnotations` | annotations to add to each pod | `{}` -`linuxPodAnnotations` | Linux annotations to add to each pod | `{}` -`windowsPodAnnotations` | Windows annotations to add to each pod | `{}` -`podLabels` | labels to add to each pod | `{}` -`linuxPodLabels` | labels to add to each Linux pod | `{}` -`windowsPodLabels` | labels to add to each Windows pod | `{}` -`priorityClassName` | Name of the priorityClass | `system-node-critical` -`resources` | Resources for the pods | `requests.cpu: 50m, requests.memory: 64Mi, limits.cpu: 100m, limits.memory: 128Mi` -`dnsPolicy` | DaemonSet DNS policy | Linux: `ClusterFirstWithHostNet`, Windows: `ClusterFirst` -`nodeSelector` | Tells the all daemon sets where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. Value must be a valid yaml expression. | `{}` -`linuxNodeSelector` | Tells the Linux daemon set where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. Value must be a valid yaml expression. 
| `{}` -`windowsNodeSelector` | Tells the Windows daemon set where to place the node-termination-handler pods. For example: `lifecycle: "Ec2Spot"`, `on-demand: "false"`, `aws.amazon.com/purchaseType: "spot"`, etc. Value must be a valid yaml expression. | `{}` -`tolerations` | list of node taints to tolerate | `[ {"operator": "Exists"} ]` -`rbac.create` | if `true`, create and use RBAC resources | `true` -`rbac.pspEnabled` | If `true`, create and use a restricted pod security policy | `true` -`serviceAccount.create` | If `true`, create a new service account | `true` -`serviceAccount.name` | Service account to be used | None -`serviceAccount.annotations` | Specifies the annotations for ServiceAccount | `{}` -`securityContext.runAsUserID` | User ID to run the container | `1000` -`securityContext.runAsGroupID` | Group ID to run the container | `1000` -`nodeSelectorTermsOs` | Operating System Node Selector Key | `kubernetes.io/os` -`nodeSelectorTermsArch` | CPU Architecture Node Selector Key | `kubernetes.io/arch` -`targetNodeOs` | Space separated list of node OS's to target, e.g. "linux", "windows", "linux windows". Note: Windows support is experimental. | `"linux"` -`updateStrategy` | Update strategy for the all DaemonSets (Linux and Windows) | `type=RollingUpdate,rollingUpdate.maxUnavailable=1` -`linuxUpdateStrategy` | Update strategy for the Linux DaemonSet | `type=RollingUpdate,rollingUpdate.maxUnavailable=1` -`windowsUpdateStrategy` | Update strategy for the Windows DaemonSet | `type=RollingUpdate,rollingUpdate.maxUnavailable=1` -`extraEnv` | Additional environment variables to inject into pod configuration | `[]` - -### Testing Configuration (NOT RECOMMENDED FOR PROD DEPLOYMENTS) - -Parameter | Description | Default ---- | --- | --- -`procUptimeFile` | (Used for Testing) Specify the uptime file | `/proc/uptime` -`awsEndpoint` | (Used for testing) If specified, use the AWS endpoint to make API calls | None -`awsSecretAccessKey` | (Used for testing) Pass-thru env var | None -`awsAccessKeyID` | (Used for testing) Pass-thru env var | None -`dryRun` | If true, only log if a node would be drained | `false` - -## Metrics endpoint consideration - -NTH in IMDS mode runs as a DaemonSet w/ `host_networking=true` by default. If the prometheus server is enabled, nothing else will be able to bind to the configured port (by default `:9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint. - -You can switch NTH in IMDS mode to run w/ `host_networking=false`, but you will need to make sure that IMDSv1 is enabled or IMDSv2 IP hop count will need to be incremented to 2. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html +The following tables lists the configurable parameters of the chart and their default values. These values are split up into the [common configuration](#common-configuration) shared by all AWS Node Termination Handler modes, [queue configuration](#queue-processor-mode-configuration) used when AWS Node Termination Handler is in in queue-processor mode, and [IMDS configuration](#imds-mode-configuration) used when AWS Node Termination Handler is in IMDS mode; for more information about the different modes see the project [README](https://github.com/aws/aws-node-termination-handler/blob/main/README.md). + +### Common Configuration + +The configuration in this table applies to all AWS Node Termination Handler modes. 
+ +| Parameter | Description | Default | +| ---------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | +| `image.repository` | Image repository. | `public.ecr.aws/aws-ec2/aws-node-termination-handler` | +| `image.tag` | Image tag. | `v{{ .Chart.AppVersion}}` | +| `image.pullPolicy` | Image pull policy. | `IfNotPresent` | +| `image.pullSecrets` | Image pull secrets. | `[]` | +| `nameOverride` | Override the `name` of the chart. | `""` | +| `fullnameOverride` | Override the `fullname` of the chart. | `""` | +| `serviceAccount.create` | If `true`, create a new service account. | `true` | +| `serviceAccount.name` | Service account to be used. If not set and `serviceAccount.create` is `true`, a name is generated using the full name template. | `nil` | +| `serviceAccount.annotations` | Annotations to add to the service account. | `{}` | +| `rbac.create` | If `true`, create the RBAC resources. | `true` | +| `rbac.pspEnabled` | If `true`, create a pod security policy resource. | `true` | +| `customLabels` | Labels to add to all resource metadata. | `{}` | +| `podLabels` | Labels to add to the pod. | `{}` | +| `podAnnotations` | Annotations to add to the pod. | `{}` | +| `podSecurityContext` | Security context for the pod. | _See values.yaml_ | +| `securityContext` | Security context for the _aws-node-termination-handler_ container. | _See values.yaml_ | +| `terminationGracePeriodSeconds` | The termination grace period for the pod. | `nil` | +| `resources` | Resource requests and limits for the _aws-node-termination-handler_ container. | `{}` | +| `nodeSelector` | Expressions to select a node by it's labels for pod assignment. In IMDS mode this has a higher priority than `daemonsetNodeSelector` (for backwards compatibility) but shouldn't be used. | `{}` | +| `affinity` | Affinity settings for pod assignment. In IMDS mode this has a higher priority than `daemonsetAffinity` (for backwards compatibility) but shouldn't be used. | `{}` | +| `tolerations` | Tolerations for pod assignment. In IMDS mode this has a higher priority than `daemonsetTolerations` (for backwards compatibility) but shouldn't be used. | `[]` | +| `extraEnv` | Additional environment variables for the _aws-node-termination-handler_ container. | `[]` | +| `probes` | The Kubernetes liveness probe configuration. | _See values.yaml_ | +| `logLevel` | Sets the log level (`info`,`debug`, or `error`) | `info` | +| `jsonLogging` | If `true`, use JSON-formatted logs instead of human readable logs. | `false` | +| `enablePrometheusServer` | If `true`, start an http server exposing `/metrics` endpoint for _Prometheus_. | `false` | +| `prometheusServerPort` | Replaces the default HTTP port for exposing _Prometheus_ metrics. | `9092` | +| `dryRun` | If `true`, only log if a node would be drained. | `false` | +| `cordonOnly` | If `true`, nodes will be cordoned but not drained when an interruption event occurs. | `false` | +| `taintNode` | If `true`, nodes will be tainted when an interruption event occurs. 
Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance`, `aws-node-termination-handler/spot-itn`, `aws-node-termination-handler/asg-lifecycle-termination` and `aws-node-termination-handler/rebalance-recommendation`. | `false` | +| `excludeFromLoadBalancers` | If `true`, nodes will be marked for exclusion from load balancers before they are cordoned. This applies the `node.kubernetes.io/exclude-from-external-load-balancers` label to enable the ServiceNodeExclusion feature gate. The label will not be modified or removed for nodes that already have it. | `false` | +| `deleteLocalData` | If `true`, continue even if there are pods using local data that will be deleted when the node is drained. | `true` | +| `ignoreDaemonSets` | If `true`, skip terminating daemon set managed pods. | `true` | +| `podTerminationGracePeriod` | The time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used, which defaults to 30 seconds if not specified for the pod. | `-1` | +| `nodeTerminationGracePeriod` | Period of time in seconds given to each node to terminate gracefully. Node draining will be scheduled based on this value to optimize the amount of compute time, but still safely drain the node before an event. | `120` | +| `emitKubernetesEvents` | If `true`, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. In IMDS Processor mode a default set of annotations with all the node metadata gathered from IMDS will be attached to each event. More information [here](https://github.com/aws/aws-node-termination-handler/blob/main/docs/kubernetes_events.md). | `false` | +| `kubernetesEventsExtraAnnotations` | A comma-separated list of `key=value` extra annotations to attach to all emitted Kubernetes events (e.g. `first=annotation,sample.annotation/number=two"`). | `""` | +| `webhookURL` | Posts event data to URL upon instance interruption action. | `""` | +| `webhookURLSecretName` | Pass the webhook URL as a Secret using the key `webhookurl`. | `""` | +| `webhookHeaders` | Replace the default webhook headers (e.g. `{"Content-type":"application/json"}`). | `""` | +| `webhookProxy` | Uses the specified HTTP(S) proxy for sending webhook data. | `""` | +| `webhookTemplate` | Replaces the default webhook message template (e.g. `{"text":"[NTH][Instance Interruption] EventID: {{ .EventID }} - Kind: {{ .Kind }} - Instance: {{ .InstanceID }} - Node: {{ .NodeName }} - Description: {{ .Description }} - Start Time: {{ .StartTime }}"}`). | `""` | +| `webhookTemplateConfigMapName` | Pass the webhook template file as a configmap. | "``" | +| `webhookTemplateConfigMapKey` | Name of the Configmap key storing the template file. | `""` | +| `enableSqsTerminationDraining` | If `true`, this turns on queue-processor mode which drains nodes when an SQS termination event is received. | `false` | + +### Queue-Processor Mode Configuration + +The configuration in this table applies to AWS Node Termination Handler in queue-processor mode. + +| Parameter | Description | Default | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------- | +| `replicas` | The number of replicas in the deployment when using queue-processor mode (NOTE: increasing replicas may cause duplicate webhooks since pods are stateless). 
| `1` | +| `strategy` | Specify the update strategy for the deployment. | `{}` | +| `podDisruptionBudget` | Limit the disruption for controller pods, requires at least 2 controller replicas. | `{}` | +| `serviceMonitor.create` | If `true`, create a ServiceMonitor. This requires `enablePrometheusServer: true`. | `false` | +| `serviceMonitor.namespace` | Override ServiceMonitor _Helm_ release namespace. | `nil` | +| `serviceMonitor.labels` | Additional ServiceMonitor metadata labels. | `{}` | +| `serviceMonitor.interval` | _Prometheus_ scrape interval. | `30s` | +| `serviceMonitor.sampleLimit` | Number of scraped samples accepted. | `5000` | +| `priorityClassName` | Name of the PriorityClass to use for the Deployment. | `system-cluster-critical` | +| `awsRegion` | If specified, use the AWS region for AWS API calls, else NTH will try to find the region through the `AWS_REGION` environment variable, IMDS, or the specified queue URL. | `""` | +| `queueURL` | Listens for messages on the specified SQS queue URL. | `""` | +| `workers` | The maximum amount of parallel event processors to handle concurrent events. | `10` | +| `checkASGTagBeforeDraining` | If `true`, check that the instance is tagged with the `managedAsgTag` before draining the node. | `true` | +| `managedAsgTag` | The node tag to check if `checkASGTagBeforeDraining` is `true`. | `aws-node-termination-handler/managed` | +| `assumeAsgTagPropagation` | If `true`, assume that ASG tags will be appear on the ASG's instances. | `false` | + +### IMDS Mode Configuration + +The configuration in this table applies to AWS Node Termination Handler in IMDS mode. + +| Parameter | Description | Default | +| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------- | +| `targetNodeOs` | Space separated list of node OS's to target (e.g. `"linux"`, `"windows"`, `"linux windows"`). Windows support is **EXPERIMENTAL**. | `"linux"` | +| `linuxPodLabels` | Labels to add to each Linux pod. | `{}` | +| `windowsPodLabels` | Labels to add to each Windows pod. | `{}` | +| `linuxPodAnnotations` | Annotations to add to each Linux pod. | `{}` | +| `windowsPodAnnotations` | Annotations to add to each Windows pod. | `{}` | +| `updateStrategy` | Update strategy for the all DaemonSets. | _See values.yaml_ | +| `daemonsetPriorityClassName` | Name of the PriorityClass to use for all DaemonSets. | `system-node-critical` | +| `podMonitor.create` | If `true`, create a PodMonitor. This requires `enablePrometheusServer: true`. | `false` | +| `podMonitor.namespace` | Override PodMonitor _Helm_ release namespace. | `nil` | +| `podMonitor.labels` | Additional PodMonitor metadata labels | `{}` | +| `podMonitor.interval` | _Prometheus_ scrape interval. | `30s` | +| `podMonitor.sampleLimit` | Number of scraped samples accepted. | `5000` | +| `useHostNetwork` | If `true`, enables `hostNetwork` for the Linux DaemonSet. NOTE: setting this to `false` may cause issues accessing IMDSv2 if your account is not configured with an IP hop count of 2 see [Metrics Endpoint Considerations](#metrics-endpoint-considerations) | `true` | +| `dnsPolicy` | If specified, this overrides `linuxDnsPolicy` and `windowsDnsPolicy` with a single policy. 
| `""` | +| `dnsConfig` | If specified, this sets the dnsConfig: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config | `{}` | +| `linuxDnsPolicy` | DNS policy for the Linux DaemonSet. | `""` | +| `windowsDnsPolicy` | DNS policy for the Windows DaemonSet. | `""` | +| `daemonsetNodeSelector` | Expressions to select a node by it's labels for DaemonSet pod assignment. For backwards compatibility the `nodeSelector` value has priority over this but shouldn't be used. | `{}` | +| `linuxNodeSelector` | Override `daemonsetNodeSelector` for the Linux DaemonSet. | `{}` | +| `windowsNodeSelector` | Override `daemonsetNodeSelector` for the Windows DaemonSet. | `{}` | +| `daemonsetAffinity` | Affinity settings for DaemonSet pod assignment. For backwards compatibility the `affinity` has priority over this but shouldn't be used. | `{}` | +| `linuxAffinity` | Override `daemonsetAffinity` for the Linux DaemonSet. | `{}` | +| `windowsAffinity` | Override `daemonsetAffinity` for the Windows DaemonSet. | `{}` | +| `daemonsetTolerations` | Tolerations for DaemonSet pod assignment. For backwards compatibility the `tolerations` has priority over this but shouldn't be used. | `[]` | +| `linuxTolerations` | Override `daemonsetTolerations` for the Linux DaemonSet. | `[]` | +| `windowsTolerations` | Override `daemonsetTolerations` for the Linux DaemonSet. | `[]` | +| `enableProbesServer` | If `true`, start an http server exposing `/healthz` endpoint for probes. | `false` | +| `metadataTries` | The number of times to try requesting metadata. | `3` | +| `enableSpotInterruptionDraining` | If `true`, drain nodes when the spot interruption termination notice is received. | `true` | +| `enableScheduledEventDraining` | If `true`, drain nodes before the maintenance window starts for an EC2 instance scheduled event. This is **EXPERIMENTAL**. | `false` | +| `enableRebalanceMonitoring` | If `true`, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set `enableRebalanceDraining`. | `false` | +| `enableRebalanceDraining` | If `true`, drain nodes when the rebalance recommendation notice is received. | `false` | + +### Testing Configuration + +The configuration in this table applies to AWS Node Termination Handler testing and is **NOT RECOMMENDED** FOR PRODUCTION DEPLOYMENTS. + +| Parameter | Description | Default | +| --------------------- | --------------------------------------------------------------------------------- | -------------- | +| `awsEndpoint` | (Used for testing) If specified, use the provided AWS endpoint to make API calls. | `""` | +| `awsSecretAccessKey` | (Used for testing) Pass-thru environment variable. | `nil` | +| `awsAccessKeyID` | (Used for testing) Pass-thru environment variable. | `nil` | +| `instanceMetadataURL` | (Used for testing) If specified, use the provided metadata URL. | `""` | +| `procUptimeFile` | (Used for Testing) Specify the uptime file. | `/proc/uptime` | + +## Metrics Endpoint Considerations + +AWS Node Termination HAndler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true` nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint. 
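An alternative, described in the next paragraph, is to run the DaemonSet off the host network; in that case the usual companion step is raising the IMDSv2 hop limit so pods behind the extra network hop can still reach instance metadata. A rough sketch with the AWS CLI (the instance ID is a placeholder; for ASG-managed nodes the same metadata option can be set in the launch template instead):

```
# Allow one extra hop for IMDSv2 so pods off the host network can still reach
# http://169.254.169.254 from inside the pod network namespace.
$ aws ec2 modify-instance-metadata-options \
    --instance-id i-0123456789abcdef0 \
    --http-put-response-hop-limit 2
```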
+ +You can switch NTH in IMDS mode to run with `useHostNetwork: false`, but you will need to make sure that either IMDSv1 is enabled or the IMDSv2 IP hop count is increased to 2 (see the [IMDSv2 documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html)). diff --git a/config/helm/aws-node-termination-handler/example-values-imds-linux.yaml b/config/helm/aws-node-termination-handler/example-values-imds-linux.yaml new file mode 100644 index 00000000..c0df26ca --- /dev/null +++ b/config/helm/aws-node-termination-handler/example-values-imds-linux.yaml @@ -0,0 +1,5 @@ +enableSqsTerminationDraining: false + +targetNodeOs: linux + +enableProbesServer: true diff --git a/config/helm/aws-node-termination-handler/example-values-imds-windows.yaml b/config/helm/aws-node-termination-handler/example-values-imds-windows.yaml new file mode 100644 index 00000000..193978ea --- /dev/null +++ b/config/helm/aws-node-termination-handler/example-values-imds-windows.yaml @@ -0,0 +1,5 @@ +enableSqsTerminationDraining: false + +targetNodeOs: windows + +enableProbesServer: true diff --git a/config/helm/aws-node-termination-handler/example-values-queue.yaml b/config/helm/aws-node-termination-handler/example-values-queue.yaml new file mode 100644 index 00000000..fd204ab5 --- /dev/null +++ b/config/helm/aws-node-termination-handler/example-values-queue.yaml @@ -0,0 +1,13 @@ +serviceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::99999999:role/nth-role + +resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +enableSqsTerminationDraining: true diff --git a/config/helm/aws-node-termination-handler/templates/NOTES.txt b/config/helm/aws-node-termination-handler/templates/NOTES.txt index f2dd1cec..d0aaf70c 100644 --- a/config/helm/aws-node-termination-handler/templates/NOTES.txt +++ b/config/helm/aws-node-termination-handler/templates/NOTES.txt @@ -1,3 +1,8 @@ -{{ .Release.Name }} has been installed or updated. To check the status of pods, run: - -kubectl get pods --namespace {{ .Values.namespace }} +*********************************************************************** +* AWS Node Termination Handler                                        * +*********************************************************************** + Chart version: {{ .Chart.Version }} + App version: {{ .Chart.AppVersion }} + Image tag: {{ include "aws-node-termination-handler.image" . }} + Mode : {{ if .Values.enableSqsTerminationDraining }}Queue Processor{{ else }}IMDS{{ end }} +*********************************************************************** diff --git a/config/helm/aws-node-termination-handler/templates/_helpers.tpl b/config/helm/aws-node-termination-handler/templates/_helpers.tpl index 249a9c98..45f06f4b 100644 --- a/config/helm/aws-node-termination-handler/templates/_helpers.tpl +++ b/config/helm/aws-node-termination-handler/templates/_helpers.tpl @@ -1,4 +1,5 @@ {{/* vim: set filetype=mustache: */}} + {{/* Expand the name of the chart. */}} @@ -28,20 +29,32 @@ If release name contains chart name it will be used as a full name. Equivalent to "aws-node-termination-handler.fullname" except that "-win" indicator is appended to the end. Name will not exceed 63 characters. */}} -{{- define "aws-node-termination-handler.fullname.windows" -}} +{{- define "aws-node-termination-handler.fullnameWindows" -}} {{- include "aws-node-termination-handler.fullname" . | trunc 59 | trimSuffix "-" | printf "%s-win" -}} {{- end -}} +{{/* +Create chart name and version as used by the chart label.
+*/}} +{{- define "aws-node-termination-handler.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + {{/* Common labels */}} {{- define "aws-node-termination-handler.labels" -}} -helm.sh/chart: {{ include "aws-node-termination-handler.chart" . }} {{ include "aws-node-termination-handler.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} +app.kubernetes.io/component: {{ .Release.Name }} +app.kubernetes.io/part-of: {{ .Release.Name }} app.kubernetes.io/managed-by: {{ .Release.Service }} +helm.sh/chart: {{ include "aws-node-termination-handler.chart" . }} +{{- with .Values.customLabels }} +{{ toYaml . }} +{{- end }} {{- end -}} {{/* @@ -53,10 +66,19 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- end -}} {{/* -Create chart name and version as used by the chart label. +Selector labels for the deployment */}} -{{- define "aws-node-termination-handler.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- define "aws-node-termination-handler.selectorLabelsDeployment" -}} +{{ include "aws-node-termination-handler.selectorLabels" . }} +app.kubernetes.io/component: deployment +{{- end -}} + +{{/* +Selector labels for the daemonset +*/}} +{{- define "aws-node-termination-handler.selectorLabelsDaemonset" -}} +{{ include "aws-node-termination-handler.selectorLabels" . }} +app.kubernetes.io/component: daemonset {{- end -}} {{/* @@ -71,36 +93,17 @@ Create the name of the service account to use {{- end -}} {{/* -Get the default node selector term prefix. -*/}} -{{- define "aws-node-termination-handler.defaultNodeSelectorTermsPrefix" -}} -kubernetes.io -{{- end -}} - -{{/* -Get the default node selector OS term. -*/}} -{{- define "aws-node-termination-handler.defaultNodeSelectorTermsOs" -}} - {{- list (include "aws-node-termination-handler.defaultNodeSelectorTermsPrefix" .) "os" | join "/" -}} -{{- end -}} - -{{/* -Get the default node selector Arch term. -*/}} -{{- define "aws-node-termination-handler.defaultNodeSelectorTermsArch" -}} - {{- list (include "aws-node-termination-handler.defaultNodeSelectorTermsPrefix" .) "arch" | join "/" -}} -{{- end -}} - -{{/* -Get the node selector OS term. +The image to use */}} -{{- define "aws-node-termination-handler.nodeSelectorTermsOs" -}} - {{- or .Values.nodeSelectorTermsOs (include "aws-node-termination-handler.defaultNodeSelectorTermsOs" .) -}} -{{- end -}} +{{- define "aws-node-termination-handler.image" -}} +{{- printf "%s:%s" .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) }} +{{- end }} -{{/* -Get the node selector Arch term. -*/}} -{{- define "aws-node-termination-handler.nodeSelectorTermsArch" -}} - {{- or .Values.nodeSelectorTermsArch (include "aws-node-termination-handler.defaultNodeSelectorTermsArch" .) 
-}} +{{/* Get PodDisruptionBudget API Version */}} +{{- define "aws-node-termination-handler.pdb.apiVersion" -}} + {{- if and (.Capabilities.APIVersions.Has "policy/v1") (semverCompare ">= 1.21-0" .Capabilities.KubeVersion.Version) -}} + {{- print "policy/v1" -}} + {{- else -}} + {{- print "policy/v1beta1" -}} + {{- end -}} {{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/clusterrole.yaml b/config/helm/aws-node-termination-handler/templates/clusterrole.yaml index 8418ff3e..43c2b030 100644 --- a/config/helm/aws-node-termination-handler/templates/clusterrole.yaml +++ b/config/helm/aws-node-termination-handler/templates/clusterrole.yaml @@ -1,7 +1,10 @@ +{{- if .Values.rbac.create -}} kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: name: {{ include "aws-node-termination-handler.fullname" . }} + labels: + {{- include "aws-node-termination-handler.labels" . | nindent 4 }} rules: - apiGroups: - "" @@ -46,3 +49,4 @@ rules: - create - patch {{- end }} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/clusterrolebinding.yaml b/config/helm/aws-node-termination-handler/templates/clusterrolebinding.yaml index b5c25327..1058df1b 100644 --- a/config/helm/aws-node-termination-handler/templates/clusterrolebinding.yaml +++ b/config/helm/aws-node-termination-handler/templates/clusterrolebinding.yaml @@ -1,12 +1,16 @@ +{{- if .Values.rbac.create -}} kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: name: {{ include "aws-node-termination-handler.fullname" . }} -subjects: -- kind: ServiceAccount - name: {{ template "aws-node-termination-handler.serviceAccountName" . }} - namespace: {{ .Release.Namespace }} + labels: + {{- include "aws-node-termination-handler.labels" . | nindent 4 }} roleRef: + apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: {{ include "aws-node-termination-handler.fullname" . }} - apiGroup: rbac.authorization.k8s.io +subjects: + - kind: ServiceAccount + name: {{ template "aws-node-termination-handler.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml index 0a09aa64..1e250e70 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml @@ -1,4 +1,4 @@ -{{- if and (lower .Values.targetNodeOs | contains "linux") (not .Values.enableSqsTerminationDraining) -}} +{{- if and (not .Values.enableSqsTerminationDraining) (lower .Values.targetNodeOs | contains "linux") -}} apiVersion: apps/v1 kind: DaemonSet metadata: @@ -7,220 +7,199 @@ metadata: labels: {{- include "aws-node-termination-handler.labels" . | nindent 4 }} spec: - {{- if (or .Values.updateStrategy .Values.linuxUpdateStrategy) }} + {{- with .Values.updateStrategy }} updateStrategy: - {{- with .Values.updateStrategy }} - {{- toYaml . | nindent 4 }} - {{- end }} - {{- with .Values.linuxUpdateStrategy }} - {{- toYaml . | nindent 4 }} - {{- end }} + {{- toYaml . | nindent 4 }} {{- end }} selector: matchLabels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 6 }} - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux + {{- include "aws-node-termination-handler.selectorLabelsDaemonset" . 
| nindent 6 }} + kubernetes.io/os: linux template: metadata: - {{- if (or .Values.podAnnotations .Values.linuxPodAnnotations) }} - annotations: - {{- range $key, $value := (mergeOverwrite (dict) .Values.podAnnotations .Values.linuxPodAnnotations) }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} labels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 8 }} + {{- include "aws-node-termination-handler.selectorLabelsDaemonset" . | nindent 8 }} + kubernetes.io/os: linux k8s-app: aws-node-termination-handler - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux - {{- range $key, $value := (mergeOverwrite (dict) .Values.podLabels .Values.linuxPodLabels) }} - {{ $key }}: {{ $value | quote }} + {{- with (mergeOverwrite (dict) .Values.podLabels .Values.linuxPodLabels) }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if or .Values.podAnnotations .Values.linuxPodAnnotations }} + annotations: + {{- toYaml (mergeOverwrite (dict) .Values.podAnnotations .Values.linuxPodAnnotations) | nindent 8 }} {{- end }} spec: - volumes: - - name: "uptime" - hostPath: - path: {{ .Values.procUptimeFile | default "/proc/uptime" | quote }} - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: "webhook-template" - configMap: - name: {{ .Values.webhookTemplateConfigMapName }} - {{- end }} - priorityClassName: {{ .Values.priorityClassName | quote }} - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . | quote }} - operator: In - values: - - linux - - key: {{ include "aws-node-termination-handler.nodeSelectorTermsArch" . | quote }} - operator: In - values: - - amd64 - - arm64 - - arm - - key: "eks.amazonaws.com/compute-type" - operator: NotIn - values: - - fargate - {{- with .Values.affinity }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.linuxAffinity }} - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }} + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "aws-node-termination-handler.serviceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.daemonsetPriorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.terminationGracePeriodSeconds }} + terminationGracePeriodSeconds: {{ . }} + {{- end }} hostNetwork: {{ .Values.useHostNetwork }} - dnsPolicy: {{ .Values.dnsPolicy | default "ClusterFirstWithHostNet" | quote }} + dnsPolicy: {{ default .Values.linuxDnsPolicy .Values.dnsPolicy }} + {{- with .Values.dnsConfig }} + dnsConfig: + {{- toYaml . | nindent 8 }} + {{- end }} containers: - - name: {{ include "aws-node-termination-handler.name" . }} - image: {{ .Values.image.repository }}:{{ .Values.image.tag }} - imagePullPolicy: {{ .Values.image.pullPolicy }} + - name: aws-node-termination-handler + {{- with .Values.securityContext }} securityContext: - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: {{ .Values.securityContext.runAsUserID }} - runAsGroup: {{ .Values.securityContext.runAsGroupID }} - allowPrivilegeEscalation: false - volumeMounts: - - name: "uptime" - mountPath: {{ .Values.procUptimeFile | default "/proc/uptime" | quote }} - readOnly: true + {{- toYaml . 
| nindent 12 }} + {{- end }} + image: {{ include "aws-node-termination-handler.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ENABLE_PROBES_SERVER + value: {{ .Values.enableProbesServer | quote }} + - name: PROBES_SERVER_PORT + value: {{ .Values.probes.httpGet.port | quote }} + - name: PROBES_SERVER_ENDPOINT + value: {{ .Values.probes.httpGet.path | quote }} + - name: LOG_LEVEL + value: {{ .Values.logLevel | quote }} + - name: JSON_LOGGING + value: {{ .Values.jsonLogging | quote }} + - name: ENABLE_PROMETHEUS_SERVER + value: {{ .Values.enablePrometheusServer | quote }} + - name: PROMETHEUS_SERVER_PORT + value: {{ .Values.prometheusServerPort | quote }} + {{- with .Values.instanceMetadataURL }} + - name: INSTANCE_METADATA_URL + value: {{ . | quote }} + {{- end }} + - name: METADATA_TRIES + value: {{ .Values.metadataTries | quote }} + - name: DRY_RUN + value: {{ .Values.dryRun | quote }} + - name: CORDON_ONLY + value: {{ .Values.cordonOnly | quote }} + - name: TAINT_NODE + value: {{ .Values.taintNode | quote }} + - name: EXCLUDE_FROM_LOAD_BALANCERS + value: {{ .Values.excludeFromLoadBalancers | quote }} + - name: DELETE_LOCAL_DATA + value: {{ .Values.deleteLocalData | quote }} + - name: IGNORE_DAEMON_SETS + value: {{ .Values.ignoreDaemonSets | quote }} + - name: POD_TERMINATION_GRACE_PERIOD + value: {{ .Values.podTerminationGracePeriod | quote }} + - name: NODE_TERMINATION_GRACE_PERIOD + value: {{ .Values.nodeTerminationGracePeriod | quote }} + - name: EMIT_KUBERNETES_EVENTS + value: {{ .Values.emitKubernetesEvents | quote }} + {{- with .Values.kubernetesEventsExtraAnnotations }} + - name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS + value: {{ . | quote }} + {{- end }} + {{- if or .Values.webhookURL .Values.webhookURLSecretName }} + - name: WEBHOOK_URL + {{- if .Values.webhookURLSecretName }} + valueFrom: + secretKeyRef: + name: {{ .Values.webhookURLSecretName }} + key: webhookurl + {{- else }} + value: {{ .Values.webhookURL | quote }} + {{- end }} + {{- end }} + {{- with .Values.webhookHeaders }} + - name: WEBHOOK_HEADERS + value: {{ . | quote }} + {{- end }} + {{- with .Values.webhookProxy }} + - name: WEBHOOK_PROXY + value: {{ . 
| quote }} + {{- end }} {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: "webhook-template" - mountPath: "/config/" + - name: WEBHOOK_TEMPLATE_FILE + value: {{ print "/config/" .Values.webhookTemplateConfigMapKey | quote }} + {{- else if .Values.webhookTemplate }} + - name: WEBHOOK_TEMPLATE + value: {{ .Values.webhookTemplate | quote }} {{- end }} - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: DELETE_LOCAL_DATA - value: {{ .Values.deleteLocalData | quote }} - - name: IGNORE_DAEMON_SETS - value: {{ .Values.ignoreDaemonSets | quote }} - - name: GRACE_PERIOD - value: {{ .Values.gracePeriod | quote }} - - name: POD_TERMINATION_GRACE_PERIOD - value: {{ .Values.podTerminationGracePeriod | quote }} - - name: INSTANCE_METADATA_URL - value: {{ .Values.instanceMetadataURL | quote }} - - name: NODE_TERMINATION_GRACE_PERIOD - value: {{ .Values.nodeTerminationGracePeriod | quote }} - - name: WEBHOOK_URL - {{- if .Values.webhookURLSecretName }} - valueFrom: - secretKeyRef: - name: {{ .Values.webhookURLSecretName }} - key: webhookurl - {{- else }} - value: {{ .Values.webhookURL | quote }} - {{- end }} - - name: WEBHOOK_HEADERS - value: {{ .Values.webhookHeaders | quote }} - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: WEBHOOK_TEMPLATE_FILE - value: {{ print "/config/" .Values.webhookTemplateConfigMapKey | quote }} - {{- end }} - - name: WEBHOOK_TEMPLATE - value: {{ .Values.webhookTemplate | quote }} - - name: DRY_RUN - value: {{ .Values.dryRun | quote }} - - name: ENABLE_SPOT_INTERRUPTION_DRAINING - value: {{ .Values.enableSpotInterruptionDraining | quote }} - - name: ENABLE_SCHEDULED_EVENT_DRAINING - value: {{ .Values.enableScheduledEventDraining | quote }} - - name: ENABLE_REBALANCE_MONITORING - value: {{ .Values.enableRebalanceMonitoring | quote }} - - name: ENABLE_REBALANCE_DRAINING - value: {{ .Values.enableRebalanceDraining | quote }} - - name: CHECK_ASG_TAG_BEFORE_DRAINING - value: {{ .Values.checkASGTagBeforeDraining | quote }} - - name: MANAGED_ASG_TAG - value: {{ .Values.managedAsgTag | quote }} - - name: METADATA_TRIES - value: {{ .Values.metadataTries | quote }} - - name: CORDON_ONLY - value: {{ .Values.cordonOnly | quote }} - - name: TAINT_NODE - value: {{ .Values.taintNode | quote }} - - name: JSON_LOGGING - value: {{ .Values.jsonLogging | quote }} - - name: LOG_LEVEL - value: {{ .Values.logLevel | quote }} - - name: WEBHOOK_PROXY - value: {{ .Values.webhookProxy | quote }} - - name: UPTIME_FROM_FILE - value: {{ .Values.procUptimeFile | quote }} - - name: ENABLE_PROMETHEUS_SERVER - value: {{ .Values.enablePrometheusServer | quote }} - - name: PROMETHEUS_SERVER_PORT - value: {{ .Values.prometheusServerPort | quote }} - - name: ENABLE_PROBES_SERVER - value: {{ .Values.enableProbesServer | quote }} - - name: PROBES_SERVER_PORT - value: {{ .Values.probesServerPort | quote }} - - name: PROBES_SERVER_ENDPOINT - value: {{ .Values.probesServerEndpoint | quote }} - - name: EMIT_KUBERNETES_EVENTS - value: {{ .Values.emitKubernetesEvents | quote }} - - name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS - value: {{ .Values.kubernetesEventsExtraAnnotations | quote }} -{{- range $key, $value := .Values.extraEnv }} - - name: {{ $key }} - value: {{ $value | quote }} -{{- end }} - resources: - {{- toYaml .Values.resources | nindent 
12 }} + - name: ENABLE_SPOT_INTERRUPTION_DRAINING + value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_SCHEDULED_EVENT_DRAINING + value: {{ .Values.enableScheduledEventDraining | quote }} + - name: ENABLE_REBALANCE_MONITORING + value: {{ .Values.enableRebalanceMonitoring | quote }} + - name: ENABLE_REBALANCE_DRAINING + value: {{ .Values.enableRebalanceDraining | quote }} + - name: ENABLE_SQS_TERMINATION_DRAINING + value: "false" + - name: UPTIME_FROM_FILE + value: {{ .Values.procUptimeFile | quote }} {{- if or .Values.enablePrometheusServer .Values.enableProbesServer }} ports: + {{- if .Values.enableProbesServer }} + - name: liveness-probe + protocol: TCP + containerPort: {{ .Values.probes.httpGet.port }} {{- end }} {{- if .Values.enablePrometheusServer }} - - containerPort: {{ .Values.prometheusServerPort }} - {{- if .Values.useHostNetwork }} - hostPort: {{ .Values.prometheusServerPort }} - {{- end }} - name: http-metrics - protocol: TCP + - name: http-metrics + protocol: TCP + containerPort: {{ .Values.prometheusServerPort }} {{- end }} - {{- if .Values.enableProbesServer }} - - containerPort: {{ .Values.probesServerPort }} - {{- if .Values.useHostNetwork }} - hostPort: {{ .Values.probesServerPort }} - {{- end }} - name: liveness-probe - protocol: TCP {{- end }} {{- if .Values.enableProbesServer }} livenessProbe: {{- toYaml .Values.probes | nindent 12 }} {{- end }} - nodeSelector: - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux - {{- with .Values.nodeSelector }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.linuxNodeSelector }} - {{- toYaml . | nindent 8 }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: uptime + mountPath: {{ .Values.procUptimeFile }} + readOnly: true + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + - name: webhook-template + mountPath: /config/ + {{- end }} + volumes: + - name: uptime + hostPath: + path: {{ .Values.procUptimeFile | default "/proc/uptime" }} + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + - name: webhook-template + configMap: + name: {{ .Values.webhookTemplateConfigMapName }} {{- end }} - {{- if .Values.image.pullSecrets }} - imagePullSecrets: - {{- range .Values.image.pullSecrets }} - - name: {{ . }} + nodeSelector: + kubernetes.io/os: linux + {{- with default .Values.daemonsetNodeSelector (default .Values.nodeSelector .Values.linuxNodeSelector) }} + {{- toYaml . | nindent 8 }} {{- end }} + {{- if or .Values.daemonsetAffinity (or .Values.affinity .Values.linuxAffinity) }} + affinity: + {{- toYaml (default .Values.daemonsetAffinity (default .Values.affinity .Values.linuxAffinity)) | nindent 8 }} {{- end }} - {{- with .Values.tolerations }} + {{- if or .Values.daemonsetTolerations (or .Values.tolerations .Values.linuxTolerations) }} tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} + {{- toYaml (default .Values.daemonsetTolerations (default .Values.tolerations .Values.linuxTolerations )) | nindent 8 }} + {{- end }} {{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml index d5dfa6f8..f4e2935e 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml @@ -1,196 +1,199 @@ -{{- if and (lower .Values.targetNodeOs | contains "windows") (not .Values.enableSqsTerminationDraining) -}} +{{- if and (not .Values.enableSqsTerminationDraining) (lower .Values.targetNodeOs | contains "windows") -}} apiVersion: apps/v1 kind: DaemonSet metadata: - name: {{ include "aws-node-termination-handler.fullname.windows" . }} + name: {{ include "aws-node-termination-handler.fullnameWindows" . }} namespace: {{ .Release.Namespace }} labels: {{- include "aws-node-termination-handler.labels" . | nindent 4 }} spec: - {{- if (or .Values.updateStrategy .Values.windowsUpdateStrategy) }} + {{- with .Values.updateStrategy }} updateStrategy: - {{- with .Values.updateStrategy }} - {{- toYaml . | nindent 4 }} - {{- end }} - {{- with .Values.windowsUpdateStrategy }} - {{- toYaml . | nindent 4 }} - {{- end }} + {{- toYaml . | nindent 4 }} {{- end }} selector: matchLabels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 6 }} - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows + {{- include "aws-node-termination-handler.selectorLabelsDaemonset" . | nindent 6 }} + kubernetes.io/os: windows template: metadata: - {{- if (or .Values.podAnnotations .Values.windowsPodAnnotations) }} - annotations: - {{- range $key, $value := (mergeOverwrite (dict) .Values.podAnnotations .Values.windowsPodAnnotations) }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} labels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 8 }} + {{- include "aws-node-termination-handler.selectorLabelsDaemonset" . | nindent 8 }} + kubernetes.io/os: windows k8s-app: aws-node-termination-handler - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows - {{- range $key, $value := (mergeOverwrite (dict) .Values.podLabels .Values.windowsPodLabels) }} - {{ $key }}: {{ $value | quote }} + {{- with (mergeOverwrite (dict) .Values.podLabels .Values.windowsPodLabels) }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if or .Values.podAnnotations .Values.windowsPodAnnotations }} + annotations: + {{- toYaml (mergeOverwrite (dict) .Values.podAnnotations .Values.windowsPodAnnotations) | nindent 8 }} {{- end }} spec: - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - volumes: - - name: "webhook-template" - configMap: - name: {{ .Values.webhookTemplateConfigMapName }} + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "aws-node-termination-handler.serviceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.daemonsetPriorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.terminationGracePeriodSeconds }} + terminationGracePeriodSeconds: {{ . 
}} + {{- end }} + hostNetwork: false + dnsPolicy: {{ default .Values.windowsDnsPolicy .Values.dnsPolicy }} + {{- with .Values.dnsConfig }} + dnsConfig: + {{- toYaml . | nindent 8 }} {{- end }} - priorityClassName: {{ .Values.priorityClassName | quote }} - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . | quote }} - operator: In - values: - - windows - - key: {{ include "aws-node-termination-handler.nodeSelectorTermsArch" . | quote }} - operator: In - values: - - amd64 - {{- with .Values.affinity }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.windowsAffinity }} - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }} - dnsPolicy: {{ .Values.dnsPolicy | default "ClusterFirst" | quote }} containers: - - name: {{ include "aws-node-termination-handler.name" . }} - image: {{ .Values.image.repository }}:{{ .Values.image.tag }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - volumeMounts: - - name: "webhook-template" - mountPath: "/config/" + - name: aws-node-termination-handler + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} {{- end }} + image: {{ include "aws-node-termination-handler.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: DELETE_LOCAL_DATA - value: {{ .Values.deleteLocalData | quote }} - - name: IGNORE_DAEMON_SETS - value: {{ .Values.ignoreDaemonSets | quote }} - - name: GRACE_PERIOD - value: {{ .Values.gracePeriod | quote }} - - name: POD_TERMINATION_GRACE_PERIOD - value: {{ .Values.podTerminationGracePeriod | quote }} - - name: INSTANCE_METADATA_URL - value: {{ .Values.instanceMetadataURL | quote }} - - name: NODE_TERMINATION_GRACE_PERIOD - value: {{ .Values.nodeTerminationGracePeriod | quote }} - - name: WEBHOOK_URL - value: {{ .Values.webhookURL | quote }} - - name: WEBHOOK_HEADERS - value: {{ .Values.webhookHeaders | quote }} - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: WEBHOOK_TEMPLATE_FILE - value: {{ print "/config/" .Values.webhookTemplateConfigMapKey | quote }} - {{- end }} - - name: WEBHOOK_TEMPLATE - value: {{ .Values.webhookTemplate | quote }} - - name: DRY_RUN - value: {{ .Values.dryRun | quote }} - - name: ENABLE_SPOT_INTERRUPTION_DRAINING - value: {{ .Values.enableSpotInterruptionDraining | quote }} - - name: ENABLE_SCHEDULED_EVENT_DRAINING - value: {{ .Values.enableScheduledEventDraining | quote }} - - name: ENABLE_REBALANCE_MONITORING - value: {{ .Values.enableRebalanceMonitoring | quote }} - - name: ENABLE_REBALANCE_DRAINING - value: {{ .Values.enableRebalanceDraining | quote }} - - name: CHECK_ASG_TAG_BEFORE_DRAINING - value: {{ .Values.checkASGTagBeforeDraining | quote }} - - name: MANAGED_ASG_TAG - value: {{ .Values.managedAsgTag | quote }} - - name: METADATA_TRIES - value: {{ .Values.metadataTries | quote }} - - name: CORDON_ONLY - value: {{ .Values.cordonOnly | quote }} - - name: TAINT_NODE - value: {{ .Values.taintNode | quote }} - - name: JSON_LOGGING - value: {{ .Values.jsonLogging | quote }} - - 
name: LOG_LEVEL - value: {{ .Values.logLevel | quote }} - - name: WEBHOOK_PROXY - value: {{ .Values.webhookProxy | quote }} - - name: UPTIME_FROM_FILE - value: {{ .Values.procUptimeFile | quote }} - - name: ENABLE_PROMETHEUS_SERVER - value: {{ .Values.enablePrometheusServer | quote }} - - name: PROMETHEUS_SERVER_PORT - value: {{ .Values.prometheusServerPort | quote }} - - name: ENABLE_PROBES_SERVER - value: {{ .Values.enableProbesServer | quote }} - - name: PROBES_SERVER_PORT - value: {{ .Values.probesServerPort | quote }} - - name: PROBES_SERVER_ENDPOINT - value: {{ .Values.probesServerEndpoint | quote }} - - name: EMIT_KUBERNETES_EVENTS - value: {{ .Values.emitKubernetesEvents | quote }} - - name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS - value: {{ .Values.kubernetesEventsExtraAnnotations | quote }} -{{- range $key, $value := .Values.extraEnv }} - - name: {{ $key }} - value: {{ $value | quote }} -{{- end }} - resources: - {{- toYaml .Values.resources | nindent 12 }} + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ENABLE_PROBES_SERVER + value: {{ .Values.enableProbesServer | quote }} + - name: PROBES_SERVER_PORT + value: {{ .Values.probes.httpGet.port | quote }} + - name: PROBES_SERVER_ENDPOINT + value: {{ .Values.probes.httpGet.path | quote }} + - name: LOG_LEVEL + value: {{ .Values.logLevel | quote }} + - name: JSON_LOGGING + value: {{ .Values.jsonLogging | quote }} + - name: ENABLE_PROMETHEUS_SERVER + value: {{ .Values.enablePrometheusServer | quote }} + - name: PROMETHEUS_SERVER_PORT + value: {{ .Values.prometheusServerPort | quote }} + {{- with .Values.instanceMetadataURL }} + - name: INSTANCE_METADATA_URL + value: {{ . | quote }} + {{- end }} + - name: METADATA_TRIES + value: {{ .Values.metadataTries | quote }} + - name: DRY_RUN + value: {{ .Values.dryRun | quote }} + - name: CORDON_ONLY + value: {{ .Values.cordonOnly | quote }} + - name: TAINT_NODE + value: {{ .Values.taintNode | quote }} + - name: EXCLUDE_FROM_LOAD_BALANCERS + value: {{ .Values.excludeFromLoadBalancers | quote }} + - name: DELETE_LOCAL_DATA + value: {{ .Values.deleteLocalData | quote }} + - name: IGNORE_DAEMON_SETS + value: {{ .Values.ignoreDaemonSets | quote }} + - name: POD_TERMINATION_GRACE_PERIOD + value: {{ .Values.podTerminationGracePeriod | quote }} + - name: NODE_TERMINATION_GRACE_PERIOD + value: {{ .Values.nodeTerminationGracePeriod | quote }} + - name: EMIT_KUBERNETES_EVENTS + value: {{ .Values.emitKubernetesEvents | quote }} + {{- with .Values.kubernetesEventsExtraAnnotations }} + - name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS + value: {{ . | quote }} + {{- end }} + {{- if or .Values.webhookURL .Values.webhookURLSecretName }} + - name: WEBHOOK_URL + {{- if .Values.webhookURLSecretName }} + valueFrom: + secretKeyRef: + name: {{ .Values.webhookURLSecretName }} + key: webhookurl + {{- else }} + value: {{ .Values.webhookURL | quote }} + {{- end }} + {{- end }} + {{- with .Values.webhookHeaders }} + - name: WEBHOOK_HEADERS + value: {{ . | quote }} + {{- end }} + {{- with .Values.webhookProxy }} + - name: WEBHOOK_PROXY + value: {{ . 
| quote }} + {{- end }} + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + - name: WEBHOOK_TEMPLATE_FILE + value: {{ print "/config/" .Values.webhookTemplateConfigMapKey | quote }} + {{- else if .Values.webhookTemplate }} + - name: WEBHOOK_TEMPLATE + value: {{ .Values.webhookTemplate | quote }} + {{- end }} + - name: ENABLE_SPOT_INTERRUPTION_DRAINING + value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_SCHEDULED_EVENT_DRAINING + value: {{ .Values.enableScheduledEventDraining | quote }} + - name: ENABLE_REBALANCE_MONITORING + value: {{ .Values.enableRebalanceMonitoring | quote }} + - name: ENABLE_REBALANCE_DRAINING + value: {{ .Values.enableRebalanceDraining | quote }} + - name: ENABLE_SQS_TERMINATION_DRAINING + value: "false" {{- if or .Values.enablePrometheusServer .Values.enableProbesServer }} ports: + {{- if .Values.enableProbesServer }} + - name: liveness-probe + protocol: TCP + containerPort: {{ .Values.probes.httpGet.port }} + hostPort: {{ .Values.probes.httpGet.port }} {{- end }} {{- if .Values.enablePrometheusServer }} - - containerPort: {{ .Values.prometheusServerPort }} - hostPort: {{ .Values.prometheusServerPort }} - name: http-metrics - protocol: TCP + - name: http-metrics + protocol: TCP + containerPort: {{ .Values.prometheusServerPort }} + hostPort: {{ .Values.prometheusServerPort }} {{- end }} - {{- if .Values.enableProbesServer }} - - containerPort: {{ .Values.probesServerPort }} - hostPort: {{ .Values.probesServerPort }} - name: liveness-probe - protocol: TCP {{- end }} {{- if .Values.enableProbesServer }} livenessProbe: {{- toYaml .Values.probes | nindent 12 }} {{- end }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + volumeMounts: + - name: webhook-template + mountPath: /config/ + {{- end }} + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + volumes: + - name: webhook-template + configMap: + name: {{ .Values.webhookTemplateConfigMapName }} + {{- end }} nodeSelector: - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows - {{- with .Values.nodeSelector }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.windowsNodeSelector }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- if .Values.image.pullSecrets }} - imagePullSecrets: - {{- range .Values.image.pullSecrets }} - - name: {{ . }} + kubernetes.io/os: windows + {{- with default .Values.daemonsetNodeSelector (default .Values.nodeSelector .Values.windowsNodeSelector) }} + {{- toYaml . | nindent 8 }} {{- end }} + {{- if or .Values.daemonsetAffinity (or .Values.affinity .Values.windowsAffinity) }} + affinity: + {{- toYaml (default .Values.daemonsetAffinity (default .Values.affinity .Values.windowsAffinity )) | nindent 8 }} {{- end }} - {{- with .Values.tolerations }} + {{- if or .Values.daemonsetTolerations (or .Values.tolerations .Values.windowsTolerations) }} tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} + {{- toYaml (default .Values.daemonsetTolerations (default .Values.tolerations .Values.windowsTolerations )) | nindent 8 }} + {{- end }} {{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/deployment.yaml b/config/helm/aws-node-termination-handler/templates/deployment.yaml index d2e71157..0fdd2961 100644 --- a/config/helm/aws-node-termination-handler/templates/deployment.yaml +++ b/config/helm/aws-node-termination-handler/templates/deployment.yaml @@ -8,201 +8,202 @@ metadata: {{- include "aws-node-termination-handler.labels" . | nindent 4 }} spec: replicas: {{ .Values.replicas }} + {{- with .Values.strategy }} + strategy: + {{- toYaml . | nindent 4 }} + {{- end }} selector: matchLabels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 6 }} - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux + {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 6 }} template: metadata: - annotations: - {{- range $key, $value := .Values.podAnnotations }} - {{ $key }}: {{ $value | quote }} - {{- end }} labels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 8 }} + {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 8 }} k8s-app: aws-node-termination-handler - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux - {{- range $key, $value := .Values.podLabels }} - {{ $key }}: {{ $value | quote }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} {{- end }} spec: - volumes: - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: "webhook-template" - configMap: - name: {{ .Values.webhookTemplateConfigMapName }} - {{- end }} - priorityClassName: {{ .Values.priorityClassName | quote }} - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . | quote }} - operator: In - values: - - linux - - key: {{ include "aws-node-termination-handler.nodeSelectorTermsArch" . | quote }} - operator: In - values: - - amd64 - - arm64 - - arm - {{- with .Values.affinity }} - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }} - hostNetwork: false - dnsPolicy: {{ .Values.dnsPolicy | quote }} + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "aws-node-termination-handler.serviceAccountName" . }} + {{- with .Values.podSecurityContext }} securityContext: - fsGroup: {{ .Values.securityContext.runAsGroupID }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.terminationGracePeriodSeconds }} + terminationGracePeriodSeconds: {{ . }} + {{- end }} + {{- with .Values.dnsConfig }} + dnsConfig: + {{- toYaml . | nindent 8 }} + {{- end }} containers: - - name: {{ include "aws-node-termination-handler.name" . 
}} - image: {{ .Values.image.repository }}:{{ .Values.image.tag }} - imagePullPolicy: {{ .Values.image.pullPolicy }} + - name: aws-node-termination-handler + {{- with .Values.securityContext }} securityContext: - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: {{ .Values.securityContext.runAsUserID }} - runAsGroup: {{ .Values.securityContext.runAsGroupID }} - allowPrivilegeEscalation: false - volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + image: {{ include "aws-node-termination-handler.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ENABLE_PROBES_SERVER + value: "true" + - name: PROBES_SERVER_PORT + value: {{ .Values.probes.httpGet.port | quote }} + - name: PROBES_SERVER_ENDPOINT + value: {{ .Values.probes.httpGet.path | quote }} + - name: LOG_LEVEL + value: {{ .Values.logLevel | quote }} + - name: JSON_LOGGING + value: {{ .Values.jsonLogging | quote }} + - name: ENABLE_PROMETHEUS_SERVER + value: {{ .Values.enablePrometheusServer | quote }} + - name: PROMETHEUS_SERVER_PORT + value: {{ .Values.prometheusServerPort | quote }} + - name: CHECK_ASG_TAG_BEFORE_DRAINING + value: {{ .Values.checkASGTagBeforeDraining | quote }} + - name: MANAGED_ASG_TAG + value: {{ .Values.managedAsgTag | quote }} + - name: ASSUME_ASG_TAG_PROPAGATION + value: {{ .Values.assumeAsgTagPropagation | quote }} + - name: DRY_RUN + value: {{ .Values.dryRun | quote }} + - name: CORDON_ONLY + value: {{ .Values.cordonOnly | quote }} + - name: TAINT_NODE + value: {{ .Values.taintNode | quote }} + - name: EXCLUDE_FROM_LOAD_BALANCERS + value: {{ .Values.excludeFromLoadBalancers | quote }} + - name: DELETE_LOCAL_DATA + value: {{ .Values.deleteLocalData | quote }} + - name: IGNORE_DAEMON_SETS + value: {{ .Values.ignoreDaemonSets | quote }} + - name: POD_TERMINATION_GRACE_PERIOD + value: {{ .Values.podTerminationGracePeriod | quote }} + - name: NODE_TERMINATION_GRACE_PERIOD + value: {{ .Values.nodeTerminationGracePeriod | quote }} + - name: EMIT_KUBERNETES_EVENTS + value: {{ .Values.emitKubernetesEvents | quote }} + {{- with .Values.kubernetesEventsExtraAnnotations }} + - name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS + value: {{ . | quote }} + {{- end }} + {{- if or .Values.webhookURL .Values.webhookURLSecretName }} + - name: WEBHOOK_URL + {{- if .Values.webhookURLSecretName }} + valueFrom: + secretKeyRef: + name: {{ .Values.webhookURLSecretName }} + key: webhookurl + {{- else }} + value: {{ .Values.webhookURL | quote }} + {{- end }} + {{- end }} + {{- with .Values.webhookHeaders }} + - name: WEBHOOK_HEADERS + value: {{ . | quote }} + {{- end }} + {{- with .Values.webhookProxy }} + - name: WEBHOOK_PROXY + value: {{ . 
| quote }} + {{- end }} {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: "webhook-template" - mountPath: "/config/" + - name: WEBHOOK_TEMPLATE_FILE + value: {{ print "/config/" .Values.webhookTemplateConfigMapKey | quote }} + {{- else if .Values.webhookTemplate }} + - name: WEBHOOK_TEMPLATE + value: {{ .Values.webhookTemplate | quote }} + {{- end }} + - name: ENABLE_SPOT_INTERRUPTION_DRAINING + value: "false" + - name: ENABLE_SCHEDULED_EVENT_DRAINING + value: "false" + - name: ENABLE_REBALANCE_MONITORING + value: "false" + - name: ENABLE_REBALANCE_DRAINING + value: "false" + - name: ENABLE_SQS_TERMINATION_DRAINING + value: "true" + {{- with .Values.awsRegion }} + - name: AWS_REGION + value: {{ . | quote }} + {{- end }} + {{- with .Values.awsEndpoint }} + - name: AWS_ENDPOINT + value: {{ . | quote }} + {{- end }} + {{- if and .Values.awsAccessKeyID .Values.awsSecretAccessKey }} + - name: AWS_ACCESS_KEY_ID + value: {{ .Values.awsAccessKeyID | quote }} + - name: AWS_SECRET_ACCESS_KEY + value: {{ .Values.awsSecretAccessKey | quote }} + {{- end }} + - name: QUEUE_URL + value: {{ .Values.queueURL | quote }} + - name: WORKERS + value: {{ .Values.workers | quote }} + {{- with .Values.extraEnv }} + {{- toYaml . | nindent 12 }} {{- end }} - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: DELETE_LOCAL_DATA - value: {{ .Values.deleteLocalData | quote }} - - name: IGNORE_DAEMON_SETS - value: {{ .Values.ignoreDaemonSets | quote }} - - name: POD_TERMINATION_GRACE_PERIOD - value: {{ .Values.podTerminationGracePeriod | quote }} - - name: INSTANCE_METADATA_URL - value: {{ .Values.instanceMetadataURL | quote }} - - name: NODE_TERMINATION_GRACE_PERIOD - value: {{ .Values.nodeTerminationGracePeriod | quote }} - - name: WEBHOOK_URL - {{- if .Values.webhookURLSecretName }} - valueFrom: - secretKeyRef: - name: {{ .Values.webhookURLSecretName }} - key: webhookurl - {{- else }} - value: {{ .Values.webhookURL | quote }} - {{- end }} - - name: WEBHOOK_HEADERS - value: {{ .Values.webhookHeaders | quote }} - {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} - - name: WEBHOOK_TEMPLATE_FILE - value: {{ print "/config/" .Values.webhookTemplateConfigMapKey | quote }} - {{- end }} - - name: WEBHOOK_TEMPLATE - value: {{ .Values.webhookTemplate | quote }} - - name: DRY_RUN - value: {{ .Values.dryRun | quote }} - - name: METADATA_TRIES - value: {{ .Values.metadataTries | quote }} - - name: CORDON_ONLY - value: {{ .Values.cordonOnly | quote }} - - name: TAINT_NODE - value: {{ .Values.taintNode | quote }} - - name: JSON_LOGGING - value: {{ .Values.jsonLogging | quote }} - - name: LOG_LEVEL - value: {{ .Values.logLevel | quote }} - - name: WEBHOOK_PROXY - value: {{ .Values.webhookProxy | quote }} - - name: ENABLE_PROMETHEUS_SERVER - value: {{ .Values.enablePrometheusServer | quote }} - - name: ENABLE_PROBES_SERVER - value: {{ .Values.enableProbesServer | quote }} - - name: ENABLE_SPOT_INTERRUPTION_DRAINING - value: "false" - - name: ENABLE_SCHEDULED_EVENT_DRAINING - value: "false" - - name: ENABLE_REBALANCE_MONITORING - value: "false" - - name: ENABLE_REBALANCE_DRAINING - value: "false" - - name: ENABLE_SQS_TERMINATION_DRAINING - value: "true" - - name: QUEUE_URL - value: {{ .Values.queueURL | quote }} - - name: PROMETHEUS_SERVER_PORT - value: {{ 
.Values.prometheusServerPort | quote }} - - name: PROBES_SERVER_PORT - value: {{ .Values.probesServerPort | quote }} - - name: PROBES_SERVER_ENDPOINT - value: {{ .Values.probesServerEndpoint | quote }} - - name: AWS_REGION - value: {{ .Values.awsRegion | quote }} - - name: AWS_ENDPOINT - value: {{ .Values.awsEndpoint | quote }} - {{- if .Values.awsSecretAccessKey }} - - name: AWS_SECRET_ACCESS_KEY - value: {{ .Values.awsSecretAccessKey | quote }} - - name: AWS_ACCESS_KEY_ID - value: {{ .Values.awsAccessKeyID | quote }} - {{- end }} - - name: CHECK_ASG_TAG_BEFORE_DRAINING - value: {{ .Values.checkASGTagBeforeDraining | quote }} - - name: MANAGED_ASG_TAG - value: {{ .Values.managedAsgTag | quote }} - - name: WORKERS - value: {{ .Values.workers | quote }} - - name: EMIT_KUBERNETES_EVENTS - value: {{ .Values.emitKubernetesEvents | quote }} - - name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS - value: {{ .Values.kubernetesEventsExtraAnnotations | quote }} -{{- range $key, $value := .Values.extraEnv }} - - name: {{ $key }} - value: {{ $value | quote }} -{{- end }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - {{- if or .Values.enablePrometheusServer .Values.enableProbesServer }} ports: - {{- end }} + - name: liveness-probe + protocol: TCP + containerPort: {{ .Values.probes.httpGet.port }} {{- if .Values.enablePrometheusServer }} - - containerPort: {{ .Values.prometheusServerPort }} - name: http-metrics - protocol: TCP - {{- end }} - {{- if .Values.enableProbesServer }} - - containerPort: {{ .Values.probesServerPort }} - name: liveness-probe - protocol: TCP + - name: http-metrics + protocol: TCP + containerPort: {{ .Values.prometheusServerPort }} {{- end }} - {{- if .Values.enableProbesServer }} livenessProbe: {{- toYaml .Values.probes | nindent 12 }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + volumeMounts: + - name: webhook-template + mountPath: /config/ {{- end }} + {{- if and .Values.webhookTemplateConfigMapName .Values.webhookTemplateConfigMapKey }} + volumes: + - name: webhook-template + configMap: + name: {{ .Values.webhookTemplateConfigMapName }} + {{- end }} nodeSelector: - {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux - {{- with .Values.nodeSelector }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- if .Values.image.pullSecrets }} - imagePullSecrets: - {{- range .Values.image.pullSecrets }} - - name: {{ . }} + kubernetes.io/os: linux + {{- with .Values.nodeSelector }} + {{- toYaml . | nindent 8 }} {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} {{- end }} - {{- with .Values.tolerations }} + {{- with .Values.tolerations }} tolerations: {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} {{- end }} diff --git a/config/helm/aws-node-termination-handler/templates/pdb.yaml b/config/helm/aws-node-termination-handler/templates/pdb.yaml index 1c88ef5d..7f43ab29 100644 --- a/config/helm/aws-node-termination-handler/templates/pdb.yaml +++ b/config/helm/aws-node-termination-handler/templates/pdb.yaml @@ -1,13 +1,14 @@ {{- if and .Values.enableSqsTerminationDraining (and .Values.podDisruptionBudget (gt (int .Values.replicas) 1)) }} -apiVersion: policy/v1beta1 +apiVersion: {{ include "aws-node-termination-handler.pdb.apiVersion" . }} kind: PodDisruptionBudget metadata: name: {{ include "aws-node-termination-handler.fullname" . 
}} + namespace: {{ .Release.Namespace }} labels: {{- include "aws-node-termination-handler.labels" . | nindent 4 }} spec: selector: matchLabels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 6 }} + {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 6 }} {{- toYaml .Values.podDisruptionBudget | nindent 2 }} {{- end }} diff --git a/config/helm/aws-node-termination-handler/templates/podmonitor.yaml b/config/helm/aws-node-termination-handler/templates/podmonitor.yaml index 47ae26fb..e214d12c 100644 --- a/config/helm/aws-node-termination-handler/templates/podmonitor.yaml +++ b/config/helm/aws-node-termination-handler/templates/podmonitor.yaml @@ -1,29 +1,29 @@ -{{- if .Values.podMonitor.create }} +{{- if and (not .Values.enableSqsTerminationDraining) (and .Values.enablePrometheusServer .Values.podMonitor.create) -}} apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: {{ template "aws-node-termination-handler.fullname" . }} - {{- if .Values.podMonitor.namespace }} - namespace: {{ .Values.podMonitor.namespace }} - {{- else }} - namespace: {{ .Release.Namespace }} - {{- end }} + namespace: {{ default .Release.Namespace .Values.podMonitor.namespace }} labels: {{- include "aws-node-termination-handler.labels" . | nindent 4 }} {{- with .Values.podMonitor.labels }} {{- toYaml . | nindent 4 }} {{- end }} spec: - jobLabel: {{ include "aws-node-termination-handler.name" . }} + jobLabel: app.kubernetes.io/name namespaceSelector: matchNames: - - {{ .Release.Namespace }} + - {{ .Release.Namespace }} podMetricsEndpoints: - - interval: {{ .Values.podMonitor.interval }} - path: /metrics - port: http-metrics - sampleLimit: {{ .Values.podMonitor.sampleLimit }} + - port: http-metrics + path: /metrics + {{- with .Values.podMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.podMonitor.sampleLimit }} + sampleLimit: {{ . }} + {{- end }} selector: matchLabels: - {{- include "aws-node-termination-handler.selectorLabels" . | nindent 6 }} -{{- end }} + {{- include "aws-node-termination-handler.selectorLabelsDaemonset" . | nindent 6 }} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/psp.yaml b/config/helm/aws-node-termination-handler/templates/psp.yaml index ea953f8f..70c576e8 100644 --- a/config/helm/aws-node-termination-handler/templates/psp.yaml +++ b/config/helm/aws-node-termination-handler/templates/psp.yaml @@ -19,8 +19,8 @@ spec: max: {{ .Values.prometheusServerPort }} {{- end }} {{- if .Values.enableProbesServer }} - - min: {{ .Values.probesServerPort }} - max: {{ .Values.probesServerPort }} + - min: {{ .Values.probes.httpGet.port }} + max: {{ .Values.probes.httpGet.port }} {{- end }} {{- end }} readOnlyRootFilesystem: false @@ -38,8 +38,8 @@ spec: volumes: - '*' --- -kind: Role apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: name: {{ template "aws-node-termination-handler.fullname" . }}-psp namespace: {{ .Release.Namespace }} diff --git a/config/helm/aws-node-termination-handler/templates/service.yaml b/config/helm/aws-node-termination-handler/templates/service.yaml new file mode 100644 index 00000000..1779749e --- /dev/null +++ b/config/helm/aws-node-termination-handler/templates/service.yaml @@ -0,0 +1,18 @@ +{{- if and .Values.enableSqsTerminationDraining .Values.enablePrometheusServer -}} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "aws-node-termination-handler.fullname" . 
}} + namespace: {{ .Release.Namespace }} + labels: + {{- include "aws-node-termination-handler.labels" . | nindent 4 }} +spec: + type: ClusterIP + selector: + {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 4 }} + ports: + - name: http-metrics + port: {{ .Values.prometheusServerPort }} + targetPort: http-metrics + protocol: TCP +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/serviceaccount.yaml b/config/helm/aws-node-termination-handler/templates/serviceaccount.yaml index 40daa397..08f3dc5f 100644 --- a/config/helm/aws-node-termination-handler/templates/serviceaccount.yaml +++ b/config/helm/aws-node-termination-handler/templates/serviceaccount.yaml @@ -4,10 +4,10 @@ kind: ServiceAccount metadata: name: {{ template "aws-node-termination-handler.serviceAccountName" . }} namespace: {{ .Release.Namespace }} -{{- with .Values.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} -{{- end }} labels: {{- include "aws-node-termination-handler.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} {{- end -}} diff --git a/config/helm/aws-node-termination-handler/templates/servicemonitor.yaml b/config/helm/aws-node-termination-handler/templates/servicemonitor.yaml new file mode 100644 index 00000000..d5fe1479 --- /dev/null +++ b/config/helm/aws-node-termination-handler/templates/servicemonitor.yaml @@ -0,0 +1,29 @@ +{{- if and .Values.enableSqsTerminationDraining (and .Values.enablePrometheusServer .Values.serviceMonitor.create) -}} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "aws-node-termination-handler.fullname" . }} + namespace: {{ default .Release.Namespace .Values.serviceMonitor.namespace }} + labels: + {{- include "aws-node-termination-handler.labels" . | nindent 4 }} + {{- with .Values.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http-metrics + path: /metrics + {{- with .Values.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.serviceMonitor.sampleLimit }} + sampleLimit: {{ . }} + {{- end }} + selector: + matchLabels: + {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 6 }} +{{- end -}} diff --git a/config/helm/aws-node-termination-handler/test.yaml b/config/helm/aws-node-termination-handler/test.yaml deleted file mode 100644 index a24efaa9..00000000 --- a/config/helm/aws-node-termination-handler/test.yaml +++ /dev/null @@ -1,175 +0,0 @@ -# Test values for aws-node-termination-handler. -# This is a YAML-formatted file. -# Declare variables to test template rendering functionality. 
- -image: - repository: amazon/aws-node-termination-handler - tag: v1.6.1 - pullPolicy: IfNotPresent - pullSecrets: ["test"] - -securityContext: - runAsUserID: 1000 - runAsGroupID: 1000 - -nameOverride: "test-nth" -fullnameOverride: "test-aws-node-termination-handler" - -priorityClassName: system-node-critical - -podAnnotations: { - test: test -} -linuxPodAnnotations: { - test: test -} -windowsPodAnnotations: { - test: test -} - -podLabels: { - test: test -} -linuxPodLabels: { - test: test -} -windowsPodLabels: { - test: test -} - -resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" - cpu: "100m" - -## enableSpotInterruptionDraining If false, do not drain nodes when the spot interruption termination notice is received -enableSpotInterruptionDraining: true - -## enableScheduledEventDraining [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event -enableScheduledEventDraining: true - -# Total number of times to try making the metadata request before failing. -metadataTries: 3 - -# Cordon but do not drain nodes upon spot interruption termination notice. -cordonOnly: false - -# Taint node upon spot interruption termination notice. -taintNode: false - -# Log messages in JSON format. -jsonLogging: false - -## dryRun tells node-termination-handler to only log calls to kubernetes control plane -dryRun: false - -# deleteLocalData tells kubectl to continue even if there are pods using -# emptyDir (local data that will be deleted when the node is drained). -deleteLocalData: true - -# ignoreDaemonSets causes kubectl to skip Daemon Set managed pods. -ignoreDaemonSets: true - -# gracePeriod (DEPRECATED - use podTerminationGracePeriod instead) is time in seconds given to each pod to terminate gracefully. -# If negative, the default value specified in the pod will be used. -gracePeriod: 1 -podTerminationGracePeriod: 1 - -# nodeTerminationGracePeriod specifies the period of time in seconds given to each NODE to terminate gracefully. Node draining will be scheduled based on this value to optimize the amount of compute time, but still safely drain the node before an event. -nodeTerminationGracePeriod: 1 - -# webhookURL if specified, posts event data to URL upon instance interruption action. -webhookURL: https://localhost:1338 - -# Webhook URL will be fetched from the secret store using the given name. -webhookURLSecretName: test - -# webhookProxy if specified, uses this HTTP(S) proxy configuration. -webhookProxy: tcp://localhost:1338 - -# webhookHeaders if specified, replaces the default webhook headers. -webhookHeaders: "Content-Type: json" - -# webhookTemplate if specified, replaces the default webhook message template. -webhookTemplate: "{\"Content\":\"[NTH][Instance Interruption] InstanceId\"}" - -# instanceMetadataURL is used to override the default metadata URL (default: http://169.254.169.254:80) -instanceMetadataURL: "https://localhost:1338" - -# (TESTING USE): Mount path for uptime file -procUptimeFile: "/proc/uptime" - -# Create node OS specific daemonset(s). (e.g. "linux", "windows", "linux windows") -targetNodeOs: "linux" - -# nodeSelector tells both linux and windows daemonsets where to place the node-termination-handler -# pods. By default, this value is empty and every node will receive a pod. -nodeSelector: { - test: test -} -# linuxNodeSelector tells the linux daemonset where to place the node-termination-handler -# pods. By default, this value is empty and every linux node will receive a pod. 
-linuxNodeSelector: { - test: test -} -# windowsNodeSelector tells the windows daemonset where to place the node-termination-handler -# pods. By default, this value is empty and every windows node will receive a pod. -windowsNodeSelector: { - test: test -} - -enablePrometheusServer: true -prometheusServerPort: 9092 - -tolerations: -- operator: "Exists" - -affinity: { - test: test -} -linuxAffinity: { - test: test -} -windowsAffinity: { - test: test -} - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. If name is not set and create is true, - # a name is generated using fullname template - name: test - annotations: { - test: test - } - # eks.amazonaws.com/role-arn: arn:aws:iam::AWS_ACCOUNT_ID:role/IAM_ROLE_NAME - -rbac: - # rbac.pspEnabled: `true` if PodSecurityPolicy resources should be created - pspEnabled: true - -dnsPolicy: "ClusterFirstWithHostNet" - -podMonitor: - # Specifies whether PodMonitor should be created - create: true - # The Prometheus scrape interval - interval: 30s - # The number of scraped samples that will be accepted - sampleLimit: 5000 - # Additional labels to add to the metadata - labels: { - test: test - } - -# K8s DaemonSet update strategy. -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 1 - linuxUpdateStrategy: "RollingUpdate" - windowsUpdateStrategy: "RollingUpdate" diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml index c97eef36..88491d83 100644 --- a/config/helm/aws-node-termination-handler/values.yaml +++ b/config/helm/aws-node-termination-handler/values.yaml @@ -4,81 +4,76 @@ image: repository: public.ecr.aws/aws-ec2/aws-node-termination-handler - tag: v1.13.4 + # Overrides the image tag whose default is {{ printf "v%s" .Chart.AppVersion }} + tag: "" pullPolicy: IfNotPresent pullSecrets: [] -securityContext: - runAsUserID: 1000 - runAsGroupID: 1000 - nameOverride: "" fullnameOverride: "" -extraEnv: [] +serviceAccount: + # Specifies whether a service account should be created + create: true + # The name of the service account to use. If namenot set and create is true, a name is generated using fullname template + name: + annotations: {} + # eks.amazonaws.com/role-arn: arn:aws:iam::AWS_ACCOUNT_ID:role/IAM_ROLE_NAME -priorityClassName: system-node-critical +rbac: + # Specifies whether RBAC resources should be created + create: true + # Specifies if PodSecurityPolicy resources should be created + pspEnabled: true -podAnnotations: {} -linuxPodAnnotations: {} -windowsPodAnnotations: {} +customLabels: {} podLabels: {} -linuxPodLabels: {} -windowsPodLabels: {} -# liveness probe settings. 
-probes: - httpGet: - path: /healthz - port: 8080 - initialDelaySeconds: 5 - periodSeconds: 5 +podAnnotations: {} -resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" - cpu: "100m" +podSecurityContext: + fsGroup: 1000 -# enableSqsTerminationDraining If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received -enableSqsTerminationDraining: false +securityContext: + readOnlyRootFilesystem: true + runAsNonRoot: true + allowPrivilegeEscalation: false + runAsUser: 1000 + runAsGroup: 1000 -# enableRebalanceMonitoring If true, cordon nodes when the rebalance recommendation notice is received -enableRebalanceMonitoring: false +terminationGracePeriodSeconds: -# enableRebalanceDraining If true, drain nodes when the rebalance recommendation notice is received -enableRebalanceDraining: false +resources: {} -# queueURL Listens for messages on the specified SQS queue URL -queueURL: "" +nodeSelector: {} -# checkASGTagBeforeDraining If true, check that the instance is tagged with "aws-node-termination-handler/managed" as the key before draining the node -checkASGTagBeforeDraining: true +affinity: {} -# managedAsgTag The tag to ensure is on a node if checkASGTagBeforeDraining is true -managedAsgTag: "aws-node-termination-handler/managed" +tolerations: [] -# awsRegion If specified, use the AWS region for AWS API calls -awsRegion: "" +# Extra environment variables +extraEnv: [] -# awsEndpoint If specified, use the AWS endpoint to make API calls. -awsEndpoint: "" +# Liveness probe settings +probes: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 -# These should only be used for testing w/ localstack! -awsSecretAccessKey: -awsAccessKeyID: +# Set the log level +logLevel: info -# enableSpotInterruptionDraining If false, do not drain nodes when the spot interruption termination notice is received -enableSpotInterruptionDraining: "" +# Log messages in JSON format +jsonLogging: false -# enableScheduledEventDraining [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event -enableScheduledEventDraining: "" +enablePrometheusServer: false +prometheusServerPort: 9092 -# Total number of times to try making the metadata request before failing. -metadataTries: 3 +# dryRun tells node-termination-handler to only log calls to kubernetes control plane +dryRun: false # Cordon but do not drain nodes upon spot interruption termination notice. cordonOnly: false @@ -86,29 +81,28 @@ cordonOnly: false # Taint node upon spot interruption termination notice. taintNode: false -# Log messages in JSON format. -jsonLogging: false - -# Sets the log level -logLevel: "info" - -# dryRun tells node-termination-handler to only log calls to kubernetes control plane -dryRun: false +# Exclude node from load balancer before cordoning via the ServiceNodeExclusion feature gate. +excludeFromLoadBalancers: false # deleteLocalData tells kubectl to continue even if there are pods using # emptyDir (local data that will be deleted when the node is drained). -deleteLocalData: "" +deleteLocalData: true # ignoreDaemonSets causes kubectl to skip Daemon Set managed pods. -ignoreDaemonSets: "" +ignoreDaemonSets: true -# gracePeriod (DEPRECATED - use podTerminationGracePeriod instead) is time in seconds given to each pod to terminate gracefully. -# If negative, the default value specified in the pod will be used. 
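The `probes` block retained above points the liveness probe at `/healthz` on port 8080. A minimal sketch of a handler that would answer such a probe — purely illustrative; the real probes server is toggled by `enableProbesServer`/`probesServerPort` and its implementation is not part of this diff:

```go
package main

import (
	"log"
	"net/http"
)

// Minimal liveness endpoint matching the chart's probe defaults (/healthz on 8080).
func main() {
	http.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok"))
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```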
-gracePeriod: "" -podTerminationGracePeriod: "" +# podTerminationGracePeriod is time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used. +podTerminationGracePeriod: -1 # nodeTerminationGracePeriod specifies the period of time in seconds given to each NODE to terminate gracefully. Node draining will be scheduled based on this value to optimize the amount of compute time, but still safely drain the node before an event. -nodeTerminationGracePeriod: "" +nodeTerminationGracePeriod: 120 + +# emitKubernetesEvents If true, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. In IMDS Processor mode a default set of annotations with all the node metadata gathered from IMDS will be attached to each event +emitKubernetesEvents: false + +# kubernetesEventsExtraAnnotations A comma-separated list of key=value extra annotations to attach to all emitted Kubernetes events +# Example: "first=annotation,sample.annotation/number=two" +kubernetesEventsExtraAnnotations: "" # webhookURL if specified, posts event data to URL upon instance interruption action. webhookURL: "" @@ -116,11 +110,14 @@ webhookURL: "" # Webhook URL will be fetched from the secret store using the given name. webhookURLSecretName: "" +# webhookHeaders if specified, replaces the default webhook headers. +webhookHeaders: "" + # webhookProxy if specified, uses this HTTP(S) proxy configuration. webhookProxy: "" -# webhookHeaders if specified, replaces the default webhook headers. -webhookHeaders: "" +# webhookTemplate if specified, replaces the default webhook message template. +webhookTemplate: "" # webhook template file will be fetched from given config map name # if specified, replaces the default webhook message with the content of the template file @@ -129,100 +126,158 @@ webhookTemplateConfigMapName: "" # template file name stored in configmap webhookTemplateConfigMapKey: "" -# webhookTemplate if specified, replaces the default webhook message template. -webhookTemplate: "" +# enableSqsTerminationDraining If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received +enableSqsTerminationDraining: false -# instanceMetadataURL is used to override the default metadata URL (default: http://169.254.169.254:80) -instanceMetadataURL: "" +# --------------------------------------------------------------------------------------------------------------------- +# Queue Processor Mode +# --------------------------------------------------------------------------------------------------------------------- -# (TESTING USE): Mount path for uptime file -procUptimeFile: "" +# The number of replicas in the NTH deployment when using queue-processor mode (NOTE: increasing this may cause duplicate webhooks since NTH pods are stateless) +replicas: 1 -# Create node OS specific daemonset(s). (e.g. "linux", "windows", "linux windows") -targetNodeOs: "linux" +# Specify the update strategy for the deployment +strategy: {} -# nodeSelector tells both linux and windows daemonsets where to place the node-termination-handler -# pods. By default, this value is empty and every node will receive a pod. -nodeSelector: {} -# linuxNodeSelector tells the linux daemonset where to place the node-termination-handler -# pods. By default, this value is empty and every linux node will receive a pod. 
-linuxNodeSelector: {} -# windowsNodeSelector tells the windows daemonset where to place the node-termination-handler -# pods. By default, this value is empty and every windows node will receive a pod. -windowsNodeSelector: {} +# podDisruptionBudget specifies the disruption budget for the controller pods. +# Disruption budget will be configured only when the replicaCount is greater than 1 +podDisruptionBudget: {} +# maxUnavailable: 1 -nodeSelectorTermsOs: "" -nodeSelectorTermsArch: "" +serviceMonitor: + # Specifies whether ServiceMonitor should be created + # this needs enableSqsTerminationDraining: true + # and enablePrometheusServer: true + create: false + # Specifies whether the ServiceMonitor should be created in a different namespace than + # the Helm release + namespace: + # Additional labels to add to the metadata + labels: {} + # The Prometheus scrape interval + interval: 30s + # The number of scraped samples that will be accepted + sampleLimit: 5000 -enablePrometheusServer: false -prometheusServerPort: 9092 +priorityClassName: system-cluster-critical -enableProbesServer: false -probesServerPort: 8080 -probesServerEndpoint: "/healthz" +# If specified, use the AWS region for AWS API calls +awsRegion: "" -# emitKubernetesEvents If true, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. In IMDS Processor mode a default set of annotations with all the node metadata gathered from IMDS will be attached to each event -emitKubernetesEvents: false +# Listens for messages on the specified SQS queue URL +queueURL: "" -# kubernetesEventsExtraAnnotations A comma-separated list of key=value extra annotations to attach to all emitted Kubernetes events -# Example: "first=annotation,sample.annotation/number=two" -kubernetesEventsExtraAnnotations: "" +# The maximum amount of parallel event processors to handle concurrent events +workers: 10 -tolerations: - - operator: "Exists" +# If true, check that the instance is tagged with "aws-node-termination-handler/managed" as the key before draining the node +checkASGTagBeforeDraining: true -affinity: {} -linuxAffinity: {} -windowsAffinity: {} +# The tag to ensure is on a node if checkASGTagBeforeDraining is true +managedAsgTag: "aws-node-termination-handler/managed" -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. If namenot set and create is true, - # a name is generated using fullname template - name: - annotations: {} - # eks.amazonaws.com/role-arn: arn:aws:iam::AWS_ACCOUNT_ID:role/IAM_ROLE_NAME +# If true, assume that ASG tags will be appear on the ASG's instances +assumeAsgTagPropagation: false -rbac: - # rbac.pspEnabled: `true` if PodSecurityPolicy resources should be created - pspEnabled: true +# --------------------------------------------------------------------------------------------------------------------- +# IMDS Mode +# --------------------------------------------------------------------------------------------------------------------- -dnsPolicy: "" +# Create node OS specific daemonset(s). (e.g. "linux", "windows", "linux windows") +targetNodeOs: linux + +linuxPodLabels: {} +windowsPodLabels: {} + +linuxPodAnnotations: {} +windowsPodAnnotations: {} + +# K8s DaemonSet update strategy. 
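The `serviceMonitor` values above only take effect because the new `servicemonitor.yaml` template earlier in this diff is wrapped in `{{- if and .Values.enableSqsTerminationDraining (and .Values.enablePrometheusServer .Values.serviceMonitor.create) -}}`. Helm templates are Go text/templates, so that gate can be reproduced with the standard library; the struct below is a stand-in for `.Values`, not the chart's actual rendering pipeline:

```go
package main

import (
	"os"
	"text/template"
)

// Mirrors the chart's gate:
// {{- if and .Values.enableSqsTerminationDraining (and .Values.enablePrometheusServer .Values.serviceMonitor.create) -}}
const tmpl = `{{ if and .EnableSqsTerminationDraining (and .EnablePrometheusServer .ServiceMonitorCreate) }}render ServiceMonitor{{ else }}skip ServiceMonitor{{ end }}
`

func main() {
	values := struct {
		EnableSqsTerminationDraining bool
		EnablePrometheusServer       bool
		ServiceMonitorCreate         bool
	}{true, true, true}

	t := template.Must(template.New("gate").Parse(tmpl))
	_ = t.Execute(os.Stdout, values) // prints "render ServiceMonitor"
}
```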
+updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 25% + +daemonsetPriorityClassName: system-node-critical podMonitor: # Specifies whether PodMonitor should be created + # this needs enableSqsTerminationDraining: false + # and enablePrometheusServer: true create: false - # The Prometheus scrape interval + # Specifies whether the PodMonitor should be created in a different namespace than + # the Helm release + namespace: + # Additional labels to add to the metadata + labels: {} + # The Prometheus scrape interval interval: 30s # The number of scraped samples that will be accepted sampleLimit: 5000 - # Additional labels to add to the metadata - labels: {} - # Specifies whether a pod monitor should be created in a different namespace than - # the Helm release - # namespace: monitoring - -# K8s DaemonSet update strategy. -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 1 -linuxUpdateStrategy: "" -windowsUpdateStrategy: "" # Determines if NTH uses host networking for Linux when running the DaemonSet (only IMDS mode; queue-processor never runs with host networking) # If you have disabled IMDSv1 and are relying on IMDSv2, you'll need to increase the IP hop count to 2 before switching this to false # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html useHostNetwork: true -# The maximal amount of parallel event processors to handle concurrent events -workers: 10 +# Daemonset DNS policy +dnsPolicy: "" +dnsConfig: {} +linuxDnsPolicy: ClusterFirstWithHostNet +windowsDnsPolicy: ClusterFirst -# The number of replicas in the NTH deployment when using queue-processor mode (NOTE: increasing this may cause duplicate webhooks since NTH pods are stateless) -replicas: 1 +daemonsetNodeSelector: {} +linuxNodeSelector: {} +windowsNodeSelector: {} -# podDisruptionBudget specifies the disruption budget for the controller pods. -# Disruption budget will be configured only when the replicaCount is greater than 1 -podDisruptionBudget: {} -# maxUnavailable: 1 +daemonsetAffinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "eks.amazonaws.com/compute-type" + operator: NotIn + values: + - fargate +linuxAffinity: {} +windowsAffinity: {} + +daemonsetTolerations: + - operator: Exists +linuxTolerations: [] +windowsTolerations: [] + +# If the probes server is running for the Daemonset +enableProbesServer: false + +# Total number of times to try making the metadata request before failing. +metadataTries: 3 + +# enableSpotInterruptionDraining If false, do not drain nodes when the spot interruption termination notice is received +enableSpotInterruptionDraining: true + +# enableScheduledEventDraining [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event +enableScheduledEventDraining: false + +# enableRebalanceMonitoring If true, cordon nodes when the rebalance recommendation notice is received +enableRebalanceMonitoring: false + +# enableRebalanceDraining If true, drain nodes when the rebalance recommendation notice is received +enableRebalanceDraining: false + +# --------------------------------------------------------------------------------------------------------------------- +# Testing +# --------------------------------------------------------------------------------------------------------------------- + +# (TESTING USE): If specified, use the provided AWS endpoint to make API calls. 
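The `daemonsetAffinity` default above keeps the IMDS-mode DaemonSet off Fargate capacity, where there is no EC2 node for the handler to manage. For reference, the same selector expressed with the Kubernetes API types the chart ultimately renders to — a sketch, not code from this repository:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// defaultDaemonsetAffinity builds the node affinity the chart's daemonsetAffinity
// default expresses: schedule only onto nodes that are not Fargate compute.
func defaultDaemonsetAffinity() *corev1.Affinity {
	return &corev1.Affinity{
		NodeAffinity: &corev1.NodeAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
				NodeSelectorTerms: []corev1.NodeSelectorTerm{{
					MatchExpressions: []corev1.NodeSelectorRequirement{{
						Key:      "eks.amazonaws.com/compute-type",
						Operator: corev1.NodeSelectorOpNotIn,
						Values:   []string{"fargate"},
					}},
				}},
			},
		},
	}
}

func main() {
	fmt.Printf("%+v\n", defaultDaemonsetAffinity())
}
```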
+awsEndpoint: "" + +# (TESTING USE): These should only be used for testing w/ localstack! +awsAccessKeyID: +awsSecretAccessKey: + +# (TESTING USE): Override the default metadata URL (default: http://169.254.169.254:80) +instanceMetadataURL: "" + +# (TESTING USE): Mount path for uptime file +procUptimeFile: /proc/uptime diff --git a/go.mod b/go.mod index f80bc078..0b522a20 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( go.opentelemetry.io/otel v0.20.0 go.opentelemetry.io/otel/exporters/metric/prometheus v0.20.0 go.opentelemetry.io/otel/metric v0.20.0 + go.uber.org/multierr v1.7.0 golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a // indirect golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4 // indirect golang.org/x/sys v0.0.0-20210608053332-aa57babbf139 diff --git a/go.sum b/go.sum index 13b17fed..83f67433 100644 --- a/go.sum +++ b/go.sum @@ -608,8 +608,12 @@ go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5/go.mod h1:nmDLcffg48OtT/PSW0H go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= +go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= +go.uber.org/multierr v1.7.0 h1:zaiO/rmgFjbmCXdSYJWQcdvOCsthmdaHfr3Gm2Kx4Ec= +go.uber.org/multierr v1.7.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= @@ -916,6 +920,8 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= diff --git a/pkg/config/config.go b/pkg/config/config.go index 2771d357..42b6fdb8 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -29,6 +29,7 @@ const ( defaultInstanceMetadataURL = "http://169.254.169.254" dryRunConfigKey = "DRY_RUN" nodeNameConfigKey = "NODE_NAME" + podNameConfigKey = "POD_NAME" kubernetesServiceHostConfigKey = "KUBERNETES_SERVICE_HOST" kubernetesServicePortConfigKey = "KUBERNETES_SERVICE_PORT" deleteLocalDataConfigKey = "DELETE_LOCAL_DATA" @@ -67,6 +68,9 @@ const ( metadataTriesDefault = 3 cordonOnly = "CORDON_ONLY" taintNode = "TAINT_NODE" + taintEffectDefault = "NoSchedule" + taintEffect = "TAINT_EFFECT" + excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS" jsonLoggingConfigKey = "JSON_LOGGING" jsonLoggingDefault = false 
logLevelConfigKey = "LOG_LEVEL" @@ -100,6 +104,7 @@ const ( type Config struct { DryRun bool NodeName string + PodName string MetadataURL string IgnoreDaemonSets bool DeleteLocalData bool @@ -123,6 +128,8 @@ type Config struct { MetadataTries int CordonOnly bool TaintNode bool + TaintEffect string + ExcludeFromLoadBalancers bool JsonLogging bool LogLevel string UptimeFromFile string @@ -152,6 +159,7 @@ func ParseCliArgs() (config Config, err error) { }() flag.BoolVar(&config.DryRun, "dry-run", getBoolEnv(dryRunConfigKey, false), "If true, only log if a node would be drained") flag.StringVar(&config.NodeName, "node-name", getEnv(nodeNameConfigKey, ""), "The kubernetes node name") + flag.StringVar(&config.PodName, "pod-name", getEnv(podNameConfigKey, ""), "The kubernetes pod name") flag.StringVar(&config.MetadataURL, "metadata-url", getEnv(instanceMetadataURLConfigKey, defaultInstanceMetadataURL), "The URL of EC2 instance metadata. This shouldn't need to be changed unless you are testing.") flag.BoolVar(&config.IgnoreDaemonSets, "ignore-daemon-sets", getBoolEnv(ignoreDaemonSetsConfigKey, true), "If true, ignore daemon sets and drain other pods when a spot interrupt is received.") flag.BoolVar(&config.DeleteLocalData, "delete-local-data", getBoolEnv(deleteLocalDataConfigKey, true), "If true, do not drain pods that are using local node storage in emptyDir") @@ -175,6 +183,8 @@ func ParseCliArgs() (config Config, err error) { flag.IntVar(&config.MetadataTries, "metadata-tries", getIntEnv(metadataTriesConfigKey, metadataTriesDefault), "The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3.") flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.") flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.") + flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.") + flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.") flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.") flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)") flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).") @@ -234,6 +244,7 @@ func (c Config) PrintJsonConfigArgs() { log.Info(). Bool("dry_run", c.DryRun). Str("node_name", c.NodeName). + Str("pod_name", c.PodName). Str("metadata_url", c.MetadataURL). Str("kubernetes_service_host", c.KubernetesServiceHost). Str("kubernetes_service_port", c.KubernetesServicePort). @@ -249,6 +260,8 @@ func (c Config) PrintJsonConfigArgs() { Int("metadata_tries", c.MetadataTries). Bool("cordon_only", c.CordonOnly). Bool("taint_node", c.TaintNode). + Str("taint_effect", c.TaintEffect). + Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers). Bool("json_logging", c.JsonLogging). Str("log_level", c.LogLevel). Str("webhook_proxy", c.WebhookProxy). 
@@ -277,6 +290,7 @@ func (c Config) PrintHumanConfigArgs() { "aws-node-termination-handler arguments: \n"+ "\tdry-run: %t,\n"+ "\tnode-name: %s,\n"+ + "\tpod-name: %s,\n"+ "\tmetadata-url: %s,\n"+ "\tkubernetes-service-host: %s,\n"+ "\tkubernetes-service-port: %s,\n"+ @@ -292,6 +306,8 @@ func (c Config) PrintHumanConfigArgs() { "\tmetadata-tries: %d,\n"+ "\tcordon-only: %t,\n"+ "\ttaint-node: %t,\n"+ + "\ttaint-effect: %s,\n"+ + "\texclude-from-load-balancers: %t,\n"+ "\tjson-logging: %t,\n"+ "\tlog-level: %s,\n"+ "\twebhook-proxy: %s,\n"+ @@ -311,6 +327,7 @@ func (c Config) PrintHumanConfigArgs() { "\taws-endpoint: %s,\n", c.DryRun, c.NodeName, + c.PodName, c.MetadataURL, c.KubernetesServiceHost, c.KubernetesServicePort, @@ -326,6 +343,8 @@ func (c Config) PrintHumanConfigArgs() { c.MetadataTries, c.CordonOnly, c.TaintNode, + c.TaintEffect, + c.ExcludeFromLoadBalancers, c.JsonLogging, c.LogLevel, c.WebhookProxy, diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 1feda2aa..969d5234 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -32,11 +32,11 @@ import ( "id": "782d5b4c-0f6f-1fd6-9d62-ecf6aed0a470", "detail-type": "EC2 Instance-terminate Lifecycle Action", "source": "aws.autoscaling", - "account": "896453262834", + "account": "123456789012", "time": "2020-07-01T22:19:58Z", "region": "us-east-1", "resources": [ - "arn:aws:autoscaling:us-east-1:896453262834:autoScalingGroup:26e7234b-03a4-47fb-b0a9-2b241662774e:autoScalingGroupName/testt1.demo-0a20f32c.kops.sh" + "arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:26e7234b-03a4-47fb-b0a9-2b241662774e:autoScalingGroupName/testt1.demo-0a20f32c.kops.sh" ], "detail": { "LifecycleActionToken": "0befcbdb-6ecd-498a-9ff7-ae9b54447cd6", @@ -55,9 +55,11 @@ type LifecycleDetail struct { LifecycleHookName string `json:"LifecycleHookName"` EC2InstanceID string `json:"EC2InstanceId"` LifecycleTransition string `json:"LifecycleTransition"` + RequestID string `json:"RequestId"` + Time string `json:"Time"` } -func (m SQSMonitor) asgTerminationToInterruptionEvent(event EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { +func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { lifecycleDetail := &LifecycleDetail{} err := json.Unmarshal(event.Detail, lifecycleDetail) if err != nil { diff --git a/pkg/monitor/sqsevent/ec2-state-change-event.go b/pkg/monitor/sqsevent/ec2-state-change-event.go index dd5d6ace..59725a19 100644 --- a/pkg/monitor/sqsevent/ec2-state-change-event.go +++ b/pkg/monitor/sqsevent/ec2-state-change-event.go @@ -50,7 +50,7 @@ type EC2StateChangeDetail struct { const instanceStatesToDrain = "stopping,stopped,shutting-down,terminated" -func (m SQSMonitor) ec2StateChangeToInterruptionEvent(event EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { +func (m SQSMonitor) ec2StateChangeToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { ec2StateChangeDetail := &EC2StateChangeDetail{} err := json.Unmarshal(event.Detail, ec2StateChangeDetail) if err != nil { diff --git a/pkg/monitor/sqsevent/rebalance-recommendation-event.go b/pkg/monitor/sqsevent/rebalance-recommendation-event.go index d1368935..8bf882b1 100644 --- a/pkg/monitor/sqsevent/rebalance-recommendation-event.go +++ b/pkg/monitor/sqsevent/rebalance-recommendation-event.go @@ -46,7 
+46,7 @@ type RebalanceRecommendationDetail struct { InstanceID string `json:"instance-id"` } -func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { +func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { rebalanceRecDetail := &RebalanceRecommendationDetail{} err := json.Unmarshal(event.Detail, rebalanceRecDetail) if err != nil { diff --git a/pkg/monitor/sqsevent/scheduled-change-event.go b/pkg/monitor/sqsevent/scheduled-change-event.go new file mode 100644 index 00000000..0f523aa1 --- /dev/null +++ b/pkg/monitor/sqsevent/scheduled-change-event.go @@ -0,0 +1,123 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package sqsevent + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "github.com/aws/aws-sdk-go/service/sqs" + "github.com/rs/zerolog/log" +) + +/* Example AWS Health Scheduled Change EC2 Event: +{ + "version": "0", + "id": "7fb65329-1628-4cf3-a740-95fg457h1402", + "detail-type": "AWS Health Event", + "source": "aws.health", + "account": "account id", + "time": "2016-06-05T06:27:57Z", + "region": "us-east-1", + "resources": ["i-12345678"], + "detail": { + "eventArn": "arn:aws:health:region::event/id", + "service": "EC2", + "eventTypeCode": "AWS_EC2_DEDICATED_HOST_NETWORK_MAINTENANCE_SCHEDULED", + "eventTypeCategory": "scheduledChange", + "startTime": "Sat, 05 Jun 2016 15:10:09 GMT", + "eventDescription": [{ + "language": "en_US", + "latestDescription": "A description of the event will be provided here" + }], + "affectedEntities": [{ + "entityValue": "i-12345678", + "tags": { + "stage": "prod", + "app": "my-app" + } + }] + } +} +*/ + +// AffectedEntity holds information about an entity that is affected by a Health event +type AffectedEntity struct { + EntityValue string `json:"entityValue"` +} + +// ScheduledChangeEventDetail holds the event details for AWS Health scheduled EC2 change events from Amazon EventBridge +type ScheduledChangeEventDetail struct { + EventTypeCategory string `json:"eventTypeCategory"` + Service string `json:"service"` + AffectedEntities []AffectedEntity `json:"affectedEntities"` +} + +func (m SQSMonitor) scheduledEventToInterruptionEvents(event *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { + scheduledChangeEventDetail := &ScheduledChangeEventDetail{} + interruptionEventWrappers := []InterruptionEventWrapper{} + + if err := json.Unmarshal(event.Detail, scheduledChangeEventDetail); err != nil { + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, err}) + } + + if scheduledChangeEventDetail.Service != "EC2" { + err := fmt.Errorf("events from Amazon EventBridge for service (%s) are not supported", scheduledChangeEventDetail.Service) + return append(interruptionEventWrappers, 
InterruptionEventWrapper{nil, err}) + } + + if scheduledChangeEventDetail.EventTypeCategory != "scheduledChange" { + err := fmt.Errorf("events from Amazon EventBridge with EventTypeCategory (%s) are not supported", scheduledChangeEventDetail.EventTypeCategory) + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, err}) + } + + for _, affectedEntity := range scheduledChangeEventDetail.AffectedEntities { + nodeInfo, err := m.getNodeInfo(affectedEntity.EntityValue) + if err != nil { + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{nil, err}) + continue + } + + // Begin drain immediately for scheduled change events to avoid disruptions in cases such as degraded hardware + interruptionEvent := monitor.InterruptionEvent{ + EventID: fmt.Sprintf("aws-health-scheduled-change-event-%x", event.ID), + Kind: SQSTerminateKind, + AutoScalingGroupName: nodeInfo.AsgName, + StartTime: time.Now(), + NodeName: nodeInfo.Name, + InstanceID: nodeInfo.InstanceID, + IsManaged: nodeInfo.IsManaged, + Description: fmt.Sprintf("AWS Health scheduled change event received. Instance %s will be interrupted at %s \n", nodeInfo.InstanceID, event.getTime()), + } + interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { + if errs := m.deleteMessages([]*sqs.Message{message}); errs != nil { + return errs[0] + } + return nil + } + interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { + if err := n.TaintScheduledMaintenance(interruptionEvent.NodeName, interruptionEvent.EventID); err != nil { + log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.ScheduledMaintenanceTaint, interruptionEvent.EventID) + } + return nil + } + + interruptionEventWrappers = append(interruptionEventWrappers, InterruptionEventWrapper{&interruptionEvent, nil}) + } + + return interruptionEventWrappers +} diff --git a/pkg/monitor/sqsevent/spot-itn-event.go b/pkg/monitor/sqsevent/spot-itn-event.go index 6b5d56c5..87fcffad 100644 --- a/pkg/monitor/sqsevent/spot-itn-event.go +++ b/pkg/monitor/sqsevent/spot-itn-event.go @@ -48,7 +48,7 @@ type SpotInterruptionDetail struct { InstanceAction string `json:"instance-action"` } -func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { +func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { spotInterruptionDetail := &SpotInterruptionDetail{} err := json.Unmarshal(event.Detail, spotInterruptionDetail) if err != nil { diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 5f25723b..13c0afda 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -53,12 +53,18 @@ type SQSMonitor struct { ManagedAsgTag string } +// InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any +type InterruptionEventWrapper struct { + InterruptionEvent *monitor.InterruptionEvent + Err error +} + // Kind denotes the kind of event that is processed func (m SQSMonitor) Kind() string { return SQSTerminateKind } -// Monitor continuously monitors SQS for events and sends interruption events to the passed in channel +// Monitor continuously monitors SQS for events and coordinates processing of the events func (m SQSMonitor) Monitor() error { log.Debug().Msg("Checking for queue messages") 
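The new `scheduled-change-event.go` above can turn a single AWS Health message into several interruption events (one per affected instance), which is why the monitor now passes around `InterruptionEventWrapper` values pairing each derived event with its error. The sketch below condenses the decision logic that appears later in `processInterruptionEvents`: forward actionable events, keep the message for retry on errors, and only suggest deleting the SQS message when every wrapper was a no-op. All types here are simplified stand-ins:

```go
package main

import (
	"errors"
	"fmt"
)

// Simplified stand-ins for the monitor package's types; names are assumptions.
type InterruptionEvent struct {
	EventID  string
	NodeName string
}

type InterruptionEventWrapper struct {
	InterruptionEvent *InterruptionEvent
	Err               error
}

var errNodeNotRunning = errors.New("node is not running")

// processWrappers mirrors the idea behind processInterruptionEvents: inspect every
// wrapper, forward actionable events, and only suggest deleting the SQS message
// when none of the wrappers needs a retry.
func processWrappers(wrappers []InterruptionEventWrapper, forward func(InterruptionEvent)) (deleteMessage bool) {
	dropSuggestions := 0
	for _, w := range wrappers {
		switch {
		case errors.Is(w.Err, errNodeNotRunning):
			dropSuggestions++ // node already gone, nothing to do
		case w.Err != nil:
			fmt.Println("keeping message for retry:", w.Err)
		case w.InterruptionEvent == nil:
			dropSuggestions++ // non-actionable event
		default:
			forward(*w.InterruptionEvent)
		}
	}
	return dropSuggestions == len(wrappers)
}

func main() {
	wrappers := []InterruptionEventWrapper{
		{InterruptionEvent: &InterruptionEvent{EventID: "evt-1", NodeName: "node-a"}},
		{Err: errNodeNotRunning},
	}
	del := processWrappers(wrappers, func(e InterruptionEvent) { fmt.Println("forwarding", e.EventID) })
	fmt.Println("delete message:", del)
}
```

In the real monitor, deletion of the message for forwarded events is deferred to each event's PostDrainTask, as the diff below shows.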
messages, err := m.receiveQueueMessages(m.QueueURL) @@ -66,79 +72,150 @@ func (m SQSMonitor) Monitor() error { return err } - failedEvents := 0 + failedEventBridgeEvents := 0 for _, message := range messages { - interruptionEvent, err := m.processSQSMessage(message) - dropMessage := false - switch { - case errors.Is(err, ErrNodeStateNotRunning): - // If the node is no longer running, just log and delete the message. If message deletion fails, count it as an error. - log.Warn().Err(err).Msg("dropping event for an already terminated node") - dropMessage = true - - case err != nil: - // Log errors and record as failed events - log.Err(err).Msg("ignoring event due to error") - failedEvents++ - - case interruptionEvent == nil: - log.Debug().Msg("dropping non-actionable event") - dropMessage = true - - case m.CheckIfManaged && !interruptionEvent.IsManaged: - // This event isn't for an instance that is managed by this process - log.Debug().Str("instance-id", interruptionEvent.InstanceID).Msg("dropping event for unmanaged node") - dropMessage = true - - case interruptionEvent.Kind == SQSTerminateKind: - // Successfully processed SQS message into a SQSTerminateKind interruption event - log.Debug().Msgf("Sending %s interruption event to the interruption channel", SQSTerminateKind) - m.InterruptionChan <- *interruptionEvent - - default: - eventJSON, _ := json.MarshalIndent(interruptionEvent, " ", " ") - log.Warn().Msgf("dropping event of an unrecognized kind: %s", eventJSON) - dropMessage = true + eventBridgeEvent, err := m.processSQSMessage(message) + if err != nil { + log.Err(err).Msg("error processing SQS message") + failedEventBridgeEvents++ + continue } - if dropMessage { - errs := m.deleteMessages([]*sqs.Message{message}) - if len(errs) > 0 { - log.Err(errs[0]).Msg("Error deleting message from SQS") - failedEvents++ - } + interruptionEventWrappers := m.processEventBridgeEvent(eventBridgeEvent, message) + + if err = m.processInterruptionEvents(interruptionEventWrappers, message); err != nil { + log.Err(err).Msg("error processing interruption events") + failedEventBridgeEvents++ } } - if len(messages) > 0 && failedEvents == len(messages) { + if len(messages) > 0 && failedEventBridgeEvents == len(messages) { return fmt.Errorf("none of the waiting queue events could be processed") } return nil } -// processSQSMessage checks sqs for new messages and returns interruption events -func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*monitor.InterruptionEvent, error) { +// processSQSMessage interprets an SQS message and returns an EventBridge event +func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, error) { event := EventBridgeEvent{} err := json.Unmarshal([]byte(*message.Body), &event) + if err != nil { - return nil, err + return &event, err } - switch event.Source { + if len(event.DetailType) == 0 { + event, err = m.processLifecycleEventFromASG(message) + } + + return &event, err +} + +// processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent +func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + eventBridgeEvent := EventBridgeEvent{} + lifecycleEvent := LifecycleDetail{} + err := json.Unmarshal([]byte(*message.Body), &lifecycleEvent) + + if err != nil || lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" { + log.Err(err).Msg("only lifecycle termination events from ASG to SQS are supported outside EventBridge") + err = 
fmt.Errorf("unsupported message type (%s)", message.String()) + return eventBridgeEvent, err + } + + eventBridgeEvent.Source = "aws.autoscaling" + eventBridgeEvent.Time = lifecycleEvent.Time + eventBridgeEvent.ID = lifecycleEvent.RequestID + eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) + + log.Debug().Msg("processing lifecycle termination event from ASG") + return eventBridgeEvent, err +} + +// processEventBridgeEvent processes an EventBridge event and returns interruption event wrappers +func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { + interruptionEventWrappers := []InterruptionEventWrapper{} + interruptionEvent := &monitor.InterruptionEvent{} + var err error + + switch eventBridgeEvent.Source { case "aws.autoscaling": - return m.asgTerminationToInterruptionEvent(event, message) + interruptionEvent, err = m.asgTerminationToInterruptionEvent(eventBridgeEvent, message) + return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) case "aws.ec2": - if event.DetailType == "EC2 Instance State-change Notification" { - return m.ec2StateChangeToInterruptionEvent(event, message) - } else if event.DetailType == "EC2 Spot Instance Interruption Warning" { - return m.spotITNTerminationToInterruptionEvent(event, message) - } else if event.DetailType == "EC2 Instance Rebalance Recommendation" { - return m.rebalanceRecommendationToInterruptionEvent(event, message) + if eventBridgeEvent.DetailType == "EC2 Instance State-change Notification" { + interruptionEvent, err = m.ec2StateChangeToInterruptionEvent(eventBridgeEvent, message) + } else if eventBridgeEvent.DetailType == "EC2 Spot Instance Interruption Warning" { + interruptionEvent, err = m.spotITNTerminationToInterruptionEvent(eventBridgeEvent, message) + } else if eventBridgeEvent.DetailType == "EC2 Instance Rebalance Recommendation" { + interruptionEvent, err = m.rebalanceRecommendationToInterruptionEvent(eventBridgeEvent, message) + } + return append(interruptionEventWrappers, InterruptionEventWrapper{interruptionEvent, err}) + + case "aws.health": + if eventBridgeEvent.DetailType == "AWS Health Event" { + return m.scheduledEventToInterruptionEvents(eventBridgeEvent, message) + } + } + + err = fmt.Errorf("event source (%s) is not supported", eventBridgeEvent.Source) + return append(interruptionEventWrappers, InterruptionEventWrapper{nil, err}) +} + +// processInterruptionEvents takes interruption event wrappers and sends events to the interruption channel +func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []InterruptionEventWrapper, message *sqs.Message) error { + dropMessageSuggestionCount := 0 + failedInterruptionEventsCount := 0 + + for _, eventWrapper := range interruptionEventWrappers { + switch { + case errors.Is(eventWrapper.Err, ErrNodeStateNotRunning): + // If the node is no longer running, just log and delete the message + log.Warn().Err(eventWrapper.Err).Msg("dropping interruption event for an already terminated node") + dropMessageSuggestionCount++ + + case eventWrapper.Err != nil: + // Log errors and record as failed events. 
Don't delete the message in order to allow retries + log.Err(eventWrapper.Err).Msg("ignoring interruption event due to error") + failedInterruptionEventsCount++ + + case eventWrapper.InterruptionEvent == nil: + log.Debug().Msg("dropping non-actionable interruption event") + dropMessageSuggestionCount++ + + case m.CheckIfManaged && !eventWrapper.InterruptionEvent.IsManaged: + // This event isn't for an instance that is managed by this process + log.Debug().Str("instance-id", eventWrapper.InterruptionEvent.InstanceID).Msg("dropping interruption event for unmanaged node") + dropMessageSuggestionCount++ + + case eventWrapper.InterruptionEvent.Kind == SQSTerminateKind: + // Successfully processed SQS message into a SQSTerminateKind interruption event + log.Debug().Msgf("Sending %s interruption event to the interruption channel", SQSTerminateKind) + m.InterruptionChan <- *eventWrapper.InterruptionEvent + + default: + eventJSON, _ := json.MarshalIndent(eventWrapper.InterruptionEvent, " ", " ") + log.Warn().Msgf("dropping interruption event of an unrecognized kind: %s", eventJSON) + dropMessageSuggestionCount++ } } - return nil, fmt.Errorf("Event source (%s) is not supported", event.Source) + + if dropMessageSuggestionCount == len(interruptionEventWrappers) { + // All interruption events weren't actionable, just delete the message. If message deletion fails, count it as an error + errs := m.deleteMessages([]*sqs.Message{message}) + if len(errs) > 0 { + log.Err(errs[0]).Msg("Error deleting message from SQS") + failedInterruptionEventsCount++ + } + } + + if failedInterruptionEventsCount != 0 { + return fmt.Errorf("some interruption events for message Id %b could not be processed", message.MessageId) + } + + return nil } // receiveQueueMessages checks the configured SQS queue for new messages diff --git a/pkg/monitor/sqsevent/sqs-monitor_test.go b/pkg/monitor/sqsevent/sqs-monitor_test.go index e702c691..b236f405 100644 --- a/pkg/monitor/sqsevent/sqs-monitor_test.go +++ b/pkg/monitor/sqsevent/sqs-monitor_test.go @@ -67,6 +67,16 @@ var asgLifecycleEvent = sqsevent.EventBridgeEvent{ }`), } +var asgLifecycleEventFromSQS = sqsevent.LifecycleDetail{ + LifecycleHookName: "test-nth-asg-to-sqs", + RequestID: "3775fac9-93c3-7ead-8713-159816566000", + LifecycleTransition: "autoscaling:EC2_INSTANCE_TERMINATING", + AutoScalingGroupName: "my-asg", + Time: "2022-01-31T23:07:47.872Z", + EC2InstanceID: "i-040107f6ba000e5ee", + LifecycleActionToken: "b4dd0f5b-0ef2-4479-9dad-6c55f027000e", +} + var rebalanceRecommendationEvent = sqsevent.EventBridgeEvent{ Version: "0", ID: "5d5555d5-dd55-5555-5555-5555dd55d55d", @@ -87,7 +97,7 @@ func TestKind(t *testing.T) { h.Assert(t, sqsevent.SQSMonitor{}.Kind() == sqsevent.SQSTerminateKind, "SQSMonitor kind should return the kind constant for the event") } -func TestMonitor_Success(t *testing.T) { +func TestMonitor_EventBridgeSuccess(t *testing.T) { spotItnEventNoTime := spotItnEvent spotItnEventNoTime.Time = "" for _, event := range []sqsevent.EventBridgeEvent{spotItnEvent, asgLifecycleEvent, spotItnEventNoTime, rebalanceRecommendationEvent} { @@ -134,6 +144,53 @@ func TestMonitor_Success(t *testing.T) { } } +func TestMonitor_AsgDirectToSqsSuccess(t *testing.T) { + event := asgLifecycleEventFromSQS + eventBytes, err := json.Marshal(&event) + h.Ok(t, err) + eventStr := string(eventBytes) + msg := sqs.Message{Body: &eventStr} + h.Ok(t, err) + messages := []*sqs.Message{ + &msg, + } + sqsMock := h.MockedSQS{ + ReceiveMessageResp: sqs.ReceiveMessageOutput{Messages: messages}, + 
ReceiveMessageErr: nil, + } + dnsNodeName := "ip-10-0-0-157.us-east-2.compute.internal" + ec2Mock := h.MockedEC2{ + DescribeInstancesResp: getDescribeInstancesResp(dnsNodeName, true, true), + } + drainChan := make(chan monitor.InterruptionEvent, 1) + + sqsMonitor := sqsevent.SQSMonitor{ + SQS: sqsMock, + EC2: ec2Mock, + ManagedAsgTag: "aws-node-termination-handler/managed", + ASG: mockIsManagedTrue(nil), + CheckIfManaged: true, + QueueURL: "https://test-queue", + InterruptionChan: drainChan, + } + + err = sqsMonitor.Monitor() + h.Ok(t, err) + + select { + case result := <-drainChan: + h.Equals(t, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(t, result.NodeName, dnsNodeName) + h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") + h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") + err = result.PostDrainTask(result, node.Node{}) + h.Ok(t, err) + default: + h.Ok(t, fmt.Errorf("Expected an event to be generated")) + } + +} + func TestMonitor_DrainTasks(t *testing.T) { testEvents := []sqsevent.EventBridgeEvent{spotItnEvent, asgLifecycleEvent, rebalanceRecommendationEvent} messages := make([]*sqs.Message, 0, len(testEvents)) diff --git a/pkg/node/node.go b/pkg/node/node.go index 42e01afb..d310f05c 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -44,6 +44,11 @@ const ( ActionLabelTimeKey = "aws-node-termination-handler/action-time" // EventIDLabelKey is a k8s label key whose value is the drainable event id EventIDLabelKey = "aws-node-termination-handler/event-id" + // Apply this label to enable the ServiceNodeExclusion feature gate for excluding nodes from load balancers + ExcludeFromLoadBalancersLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers" + // The value associated with this label is irrelevant for enabling the feature gate + // By defining a unique value it is possible to check if the label was applied by us before removing it + ExcludeFromLoadBalancersLabelValue = "aws-node-termination-handler" ) const ( @@ -95,7 +100,11 @@ func (n Node) CordonAndDrain(nodeName string, reason string) error { log.Info().Str("node_name", nodeName).Str("reason", reason).Msg("Node would have been cordoned and drained, but dry-run flag was set.") return nil } - err := n.Cordon(nodeName, reason) + err := n.MaybeMarkForExclusionFromLoadBalancers(nodeName) + if err != nil { + return err + } + err = n.Cordon(nodeName, reason) if err != nil { return err } @@ -161,13 +170,26 @@ func (n Node) IsUnschedulable(nodeName string) (bool, error) { // MarkWithEventID will add the drain event ID to the node to be properly ignored after a system restart event func (n Node) MarkWithEventID(nodeName string, eventID string) error { - err := n.addLabel(nodeName, EventIDLabelKey, eventID) + err := n.addLabel(nodeName, EventIDLabelKey, eventID, false) if err != nil { return fmt.Errorf("Unable to label node with event ID %s=%s: %w", EventIDLabelKey, eventID, err) } return nil } +// MaybeMarkForExclusionFromLoadBalancers will activate the ServiceNodeExclusion feature flag to indicate that the node should be removed from load balancers +func (n Node) MaybeMarkForExclusionFromLoadBalancers(nodeName string) error { + if !n.nthConfig.ExcludeFromLoadBalancers { + log.Debug().Msg("Not marking for exclusion from load balancers because the configuration flag is not set") + return nil + } + err := n.addLabel(nodeName, ExcludeFromLoadBalancersLabelKey, ExcludeFromLoadBalancersLabelValue, true) + if err != nil { + return fmt.Errorf("Unable to label node for 
exclusion from load balancers: %w", err) + } + return nil +} + // RemoveNTHLabels will remove all the custom NTH labels added to the node func (n Node) RemoveNTHLabels(nodeName string) error { for _, label := range []string{EventIDLabelKey, ActionLabelKey, ActionLabelTimeKey} { @@ -176,6 +198,10 @@ func (n Node) RemoveNTHLabels(nodeName string) error { return fmt.Errorf("Unable to remove %s from node: %w", label, err) } } + err := n.removeLabelIfValueMatches(nodeName, ExcludeFromLoadBalancersLabelKey, ExcludeFromLoadBalancersLabelValue) + if err != nil { + return fmt.Errorf("Unable to remove %s from node: %w", ExcludeFromLoadBalancersLabelKey, err) + } return nil } @@ -199,12 +225,12 @@ func (n Node) GetEventID(nodeName string) (string, error) { // MarkForUncordonAfterReboot adds labels to the kubernetes node which NTH will read upon reboot func (n Node) MarkForUncordonAfterReboot(nodeName string) error { // adds label to node so that the system will uncordon the node after the scheduled reboot has taken place - err := n.addLabel(nodeName, ActionLabelKey, UncordonAfterRebootLabelVal) + err := n.addLabel(nodeName, ActionLabelKey, UncordonAfterRebootLabelVal, false) if err != nil { return fmt.Errorf("Unable to label node with action to uncordon after system-reboot: %w", err) } // adds label with the current time which is checked against the uptime of the node when processing labels on startup - err = n.addLabel(nodeName, ActionLabelTimeKey, strconv.FormatInt(time.Now().Unix(), 10)) + err = n.addLabel(nodeName, ActionLabelTimeKey, strconv.FormatInt(time.Now().Unix(), 10), false) if err != nil { // if time can't be recorded, rollback the action label err := n.removeLabel(nodeName, ActionLabelKey) @@ -218,7 +244,8 @@ func (n Node) MarkForUncordonAfterReboot(nodeName string) error { } // addLabel will add a label to the node given a label key and value -func (n Node) addLabel(nodeName string, key string, value string) error { +// Specifying true for the skipExisting parameter will skip adding the label if it already exists +func (n Node) addLabel(nodeName string, key string, value string, skipExisting bool) error { type metadata struct { Labels map[string]string `json:"labels"` } @@ -240,6 +267,12 @@ func (n Node) addLabel(nodeName string, key string, value string) error { if err != nil { return err } + if skipExisting { + _, ok := node.ObjectMeta.Labels[key] + if ok { + return nil + } + } if n.nthConfig.DryRun { log.Info().Msgf("Would have added label (%s=%s) to node %s, but dry-run flag was set", key, value, nodeName) return nil @@ -282,6 +315,41 @@ func (n Node) removeLabel(nodeName string, key string) error { return nil } +// removeLabelIfValueMatches will remove a node label given a label key provided the label's value equals matchValue +func (n Node) removeLabelIfValueMatches(nodeName string, key string, matchValue string) error { + type patchRequest struct { + Op string `json:"op"` + Path string `json:"path"` + } + + var patchReqs []interface{} + patchRemove := patchRequest{ + Op: "remove", + Path: fmt.Sprintf("/metadata/labels/%s", jsonPatchEscape(key)), + } + payload, err := json.Marshal(append(patchReqs, patchRemove)) + if err != nil { + return fmt.Errorf("An error occurred while marshalling the json to remove a label from the node: %w", err) + } + node, err := n.fetchKubernetesNode(nodeName) + if err != nil { + return err + } + val, ok := node.ObjectMeta.Labels[key] + if !ok || val == matchValue { + return nil + } + if n.nthConfig.DryRun { + log.Info().Msgf("Would have removed label 
with key %s from node %s, but dry-run flag was set", key, nodeName) + return nil + } + _, err = n.drainHelper.Client.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.JSONPatchType, payload, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("%v node Patch failed when removing a label from the node: %w", node.Name, err) + } + return nil +} + // GetNodeLabels will fetch node labels for a given nodeName func (n Node) GetNodeLabels(nodeName string) (map[string]string, error) { if n.nthConfig.DryRun { @@ -310,7 +378,7 @@ func (n Node) TaintSpotItn(nodeName string, eventID string) error { eventID = eventID[:maxTaintValueLength] } - return addTaint(k8sNode, n, SpotInterruptionTaint, eventID, corev1.TaintEffectNoSchedule) + return addTaint(k8sNode, n, SpotInterruptionTaint, eventID) } // TaintASGLifecycleTermination adds the spot termination notice taint onto a node @@ -328,7 +396,7 @@ func (n Node) TaintASGLifecycleTermination(nodeName string, eventID string) erro eventID = eventID[:maxTaintValueLength] } - return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID, corev1.TaintEffectNoSchedule) + return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID) } // TaintRebalanceRecommendation adds the rebalance recommendation notice taint onto a node @@ -346,7 +414,7 @@ func (n Node) TaintRebalanceRecommendation(nodeName string, eventID string) erro eventID = eventID[:maxTaintValueLength] } - return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID, corev1.TaintEffectNoSchedule) + return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID) } // LogPods logs all the pod names on a node @@ -388,7 +456,7 @@ func (n Node) TaintScheduledMaintenance(nodeName string, eventID string) error { eventID = eventID[:maxTaintValueLength] } - return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID, corev1.TaintEffectNoSchedule) + return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID) } // RemoveNTHTaints removes NTH-specific taints from a node @@ -511,6 +579,7 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { Force: true, GracePeriodSeconds: nthConfig.PodTerminationGracePeriod, IgnoreAllDaemonSets: nthConfig.IgnoreDaemonSets, + AdditionalFilters: []drain.PodFilter{filterPodForDeletion(nthConfig.PodName)}, DeleteEmptyDirData: nthConfig.DeleteLocalData, Timeout: time.Duration(nthConfig.NodeTerminationGracePeriod) * time.Second, Out: log.Logger, @@ -540,7 +609,22 @@ func jsonPatchEscape(value string) string { return strings.Replace(value, "/", "~1", -1) } -func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effect corev1.TaintEffect) error { +func getTaintEffect(effect string) corev1.TaintEffect { + switch effect { + case "PreferNoSchedule": + return corev1.TaintEffectPreferNoSchedule + case "NoExecute": + return corev1.TaintEffectNoExecute + default: + log.Warn().Msgf("Unknown taint effect: %s", effect) + fallthrough + case "NoSchedule": + return corev1.TaintEffectNoSchedule + } +} + +func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string) error { + effect := getTaintEffect(nth.nthConfig.TaintEffect) if nth.nthConfig.DryRun { log.Info().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName) return nil @@ -679,3 +763,12 @@ func getUptimeFunc(uptimeFile string) uptime.UptimeFuncType { } return uptime.Uptime } + +func filterPodForDeletion(podName string) func(pod corev1.Pod) drain.PodDeleteStatus { + return 
func(pod corev1.Pod) drain.PodDeleteStatus { + if pod.Name == podName { + return drain.MakePodDeleteStatusSkip() + } + return drain.MakePodDeleteStatusOkay() + } +} diff --git a/scripts/build-docker-images b/scripts/build-docker-images index 74b5b718..95dc1418 100755 --- a/scripts/build-docker-images +++ b/scripts/build-docker-images @@ -47,6 +47,7 @@ while getopts "dp:r:v:" opt; do esac done + for os_arch in "${PLATFORMS[@]}"; do os=$(echo $os_arch | cut -d'/' -f1) arch=$(echo $os_arch | cut -d'/' -f2) @@ -56,13 +57,23 @@ for os_arch in "${PLATFORMS[@]}"; do dockerfile="$DOCKERFILE_PATH" if [[ $os = "windows" ]]; then dockerfile="${dockerfile}.windows" + docker build \ + --file "${dockerfile}" \ + --build-arg GOOS=${os} \ + --build-arg GOARCH=${arch} \ + --build-arg GOPROXY=${GOPROXY} \ + --tag ${img_tag} \ + ${REPO_ROOT_PATH} + else + # Launch a docker buildx instance and save its name so we can terminate it later + buildx_instance_name=$(docker buildx create --use) + docker buildx build \ + --load \ + --file "${dockerfile}" \ + --build-arg GOPROXY=${GOPROXY} \ + --tag ${img_tag} \ + --platform "${os_arch}" \ + ${REPO_ROOT_PATH} + docker buildx rm ${buildx_instance_name} fi - - docker build \ - --file "${dockerfile}" \ - --build-arg GOOS=${os} \ - --build-arg GOARCH=${arch} \ - --build-arg GOPROXY=${GOPROXY} \ - --tag ${img_tag} \ - ${REPO_ROOT_PATH} -done +done \ No newline at end of file diff --git a/scripts/draft-release-notes b/scripts/draft-release-notes deleted file mode 100755 index 978aef72..00000000 --- a/scripts/draft-release-notes +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -GIT_REPO_ROOT=$(git rev-parse --show-toplevel) -BUILD_DIR="${GIT_REPO_ROOT}/build" - -RELEASE_NOTES="${BUILD_DIR}/release-notes.md" -touch "${RELEASE_NOTES}" - ->&2 git fetch --all --tags - -if git describe HEAD --tags | grep -Eq "^v[0-9]+(\.[0-9]+)*(-[a-z0-9]+)?$"; then - LAST_RELEASE_HASH=$(git rev-list --tags --max-count=1 --skip=1 --no-walk) -else - TAG=$(git describe HEAD --tags | grep -Eo "^v[0-9]+(\.[0-9]+)*") - LAST_RELEASE_HASH=$(git rev-list -1 $TAG) -fi - -echo "## Changes" | tee -a "${RELEASE_NOTES}" -for change in $(git rev-list $LAST_RELEASE_HASH..HEAD); do - one_line_msg=$(git --no-pager log --pretty='%s (thanks to %an)' "${change}" -n1 | sed 's/^\[.*\]//') - # render markdown links for cross-posting release notes - pr_num=$(echo $one_line_msg | grep -Eo '(#[0-9]*)' || [[ $? 
== 1 ]]) - md_link="[$pr_num](https://github.com/aws/aws-node-termination-handler/pull/${pr_num:1})" - echo " - ${one_line_msg/\($pr_num\)/$md_link}" | tee -a "${RELEASE_NOTES}" -done - ->&2 echo -e "\n\nRelease notes file: ${RELEASE_NOTES}" diff --git a/scripts/generate-k8s-yaml b/scripts/generate-k8s-yaml index 3677da2e..bb04f4ac 100755 --- a/scripts/generate-k8s-yaml +++ b/scripts/generate-k8s-yaml @@ -4,7 +4,7 @@ set -euo pipefail SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" PLATFORM=$(uname | tr '[:upper:]' '[:lower:]') -HELM_VERSION="3.0.2" +HELM_VERSION="3.7.1" NAMESPACE="kube-system" MAKEFILEPATH=$SCRIPTPATH/../Makefile @@ -62,6 +62,7 @@ $BUILD_DIR/helm template aws-node-termination-handler \ $BUILD_DIR/helm template aws-node-termination-handler \ --namespace $NAMESPACE \ --set enableSqsTerminationDraining="true" \ + --set enableProbesServer="true" \ $SCRIPTPATH/../config/helm/aws-node-termination-handler/ > $QP_AGG_RESOURCES_YAML # IMDS mode - remove helm annotations from template @@ -83,6 +84,7 @@ $BUILD_DIR/helm template aws-node-termination-handler \ $BUILD_DIR/helm template aws-node-termination-handler \ --namespace $NAMESPACE \ --set enableSqsTerminationDraining="true" \ + --set enableProbesServer="true" \ --output-dir $QP_INDV_RESOURCES_DIR/ \ $SCRIPTPATH/../config/helm/aws-node-termination-handler/ diff --git a/scripts/prepare-for-release b/scripts/prepare-for-release index eff2787b..afeb5664 100755 --- a/scripts/prepare-for-release +++ b/scripts/prepare-for-release @@ -235,7 +235,8 @@ EOM # gh actions cannot respond to prompts if [[ $RELEASE_PREP == true ]]; then while true; do - read -p "🥑${BOLD}Do you wish to create the release prep PR? Enter y/n " yn + echo -e "🥑${BOLD}Do you wish to create the release prep PR? Enter y/n" + read -p "" yn case $yn in [Yy]* ) create_pr; break;; [Nn]* ) rollback; exit;; diff --git a/scripts/push-docker-images b/scripts/push-docker-images index 1736c61d..6080d061 100755 --- a/scripts/push-docker-images +++ b/scripts/push-docker-images @@ -14,7 +14,7 @@ DOCKER_CLI_CONFIG="$HOME/.docker/config.json" USAGE=$(cat << 'EOM' Usage: push-docker-images [-p ] - Pushes docker images for the platform pairs passed in w/ a dockerhub manifest + Pushes docker images for the platform pairs passed in w/ a manifest list Example: push-docker-images -p "linux/amd64,linux/arm" Optional: -p Platform pair list (os/architecture) [DEFAULT: linux/amd64] @@ -62,6 +62,8 @@ if [[ $MANIFEST == "true" ]]; then fi cat <<< "$(jq '.+{"experimental":"enabled"}' $DOCKER_CLI_CONFIG)" > $DOCKER_CLI_CONFIG echo "Enabled experimental CLI features to execute docker manifest commands" + # Delete the local version of the manifest so we rely solely on the remote manifest + docker manifest rm $IMAGE_REPO:$VERSION || : manifest_exists=$(docker manifest inspect $IMAGE_REPO:$VERSION > /dev/null ; echo $?) if [[ manifest_exists -eq 0 ]]; then echo "manifest already exists" @@ -109,6 +111,10 @@ if [[ $MANIFEST == "true" ]]; then echo "creating manifest for $updated_img" docker manifest create $IMAGE_REPO:$VERSION $updated_img --amend + # Theoretically, this will not be necessary if we move all our builds to docker buildx. + # (The Windows build is the only one not using it at the moment.) The manifest create --amend command + # should figure out the OS and architecture automatically if the container was built properly. + # However, our builds in the past required this explicit annotation, and it doesn't hurt to keep it for now. 
os_arch=$(echo ${updated_img//$IMAGE_REPO:$VERSION-/}) os=$(echo $os_arch | cut -d'-' -f1) arch=$(echo $os_arch | cut -d'-' -f2) diff --git a/test/e2e/asg-lifecycle-sqs-test b/test/e2e/asg-lifecycle-sqs-test index 4ef8ac44..b05db052 100755 --- a/test/e2e/asg-lifecycle-sqs-test +++ b/test/e2e/asg-lifecycle-sqs-test @@ -28,12 +28,12 @@ common_helm_args=() localstack_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-localstack" "$SCRIPTPATH/../../config/helm/localstack/" - --wait - --namespace default - --set nodeSelector."$NTH_CONTROL_LABEL" + --set nodeSelector."${NTH_CONTROL_LABEL}" --set defaultRegion="${AWS_REGION}" + --wait ) set -x @@ -60,21 +60,21 @@ echo "🥑 Created SQS Queue ${queue_url}" anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-acth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --namespace kube-system --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists --set awsAccessKeyID=foo --set awsSecretAccessKey=bar --set awsRegion="${AWS_REGION}" --set awsEndpoint="http://localstack.default" --set checkASGTagBeforeDraining=false --set enableSqsTerminationDraining=true - --set enableScheduledEventDraining=false - --set enableSpotInterruptionDraining=false - --set nodeSelector."$NTH_CONTROL_LABEL" - --set "queueURL=${queue_url}" + --set queueURL="${queue_url}" + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -88,11 +88,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -174,5 +175,4 @@ else echo "❌ regular-pod-test was not evicted" fi -echo "❌ ASG Lifecycle SQS Test Failed $CLUSTER_NAME ❌" -fail_and_exit 1 \ No newline at end of file +fail_and_exit 1 diff --git a/test/e2e/cordon-only-test b/test/e2e/cordon-only-test index 6214c844..ce99ba76 100755 --- a/test/e2e/cordon-only-test +++ b/test/e2e/cordon-only-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Cordon Only Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Cordon Only Test for Node Termination Handler" @@ -28,13 +28,14 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set cordonOnly="true" + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -48,10 +49,11 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --namespace default --set servicePort="$IMDS_PORT" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -63,11 +65,12 @@ set +x 
emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -83,7 +86,7 @@ TAINT_CHECK_SLEEP=15 DEPLOYED=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" DEPLOYED=1 @@ -100,8 +103,8 @@ fi cordoned=0 test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled > /dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi @@ -121,5 +124,4 @@ else echo "❌ regular-pod-test was evicted" fi -echo "❌ Cordon Only Test Failed $CLUSTER_NAME ❌" fail_and_exit 1 diff --git a/test/e2e/ec2-state-change-sqs-test b/test/e2e/ec2-state-change-sqs-test index 08de99cb..464d0232 100755 --- a/test/e2e/ec2-state-change-sqs-test +++ b/test/e2e/ec2-state-change-sqs-test @@ -28,12 +28,12 @@ common_helm_args=() localstack_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-localstack" "$SCRIPTPATH/../../config/helm/localstack/" - --wait - --namespace default - --set nodeSelector."$NTH_CONTROL_LABEL" + --set nodeSelector."${NTH_CONTROL_LABEL}" --set defaultRegion="${AWS_REGION}" + --wait ) set -x @@ -61,22 +61,22 @@ echo "🥑 Created SQS Queue ${queue_url}" anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-acth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists --set awsAccessKeyID=foo --set awsSecretAccessKey=bar --set awsRegion="${AWS_REGION}" --set awsEndpoint="http://localstack.default" --set checkASGTagBeforeDraining=false --set enableSqsTerminationDraining=true - --set enableScheduledEventDraining=false - --set enableSpotInterruptionDraining=false - --set nodeSelector."$NTH_CONTROL_LABEL" --set "queueURL=${queue_url}" + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -90,11 +90,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -183,5 +184,4 @@ else echo "❌ regular-pod-test was not evicted" fi -echo "❌ EC2 State Change SQS Test Failed $CLUSTER_NAME ❌" fail_and_exit 1 diff --git a/test/e2e/emit-events-test b/test/e2e/emit-events-test index 
c0611fc5..72f932bf 100755 --- a/test/e2e/emit-events-test +++ b/test/e2e/emit-events-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ K8s Emit Events Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting K8s Emit Events Test for Node Termination Handler" @@ -28,16 +28,17 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" --set emitKubernetesEvents="true" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -51,12 +52,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -70,12 +71,12 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set aemm.IMDSv2="true" --set servicePort="$IMDS_PORT" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -89,7 +90,7 @@ TAINT_CHECK_SLEEP=15 DEPLOYED=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" DEPLOYED=1 @@ -105,7 +106,7 @@ if [[ $DEPLOYED -eq 0 ]]; then fi test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if kubectl get events | tr -s " " | grep "CordonAndDrain node/${test_node} Node successfully cordoned and drained" >/dev/null; then echo "✅ Verified CordonAndDrain was emitted as a k8s event!" echo "✅ K8s Emit Events Test Passed $CLUSTER_NAME! 
✅"s @@ -119,5 +120,4 @@ done echo "❌ k8s CordonAndDrain event was not emitted to k8s" -echo "❌ K8s Emit Events Test failed $CLUSTER_NAME ❌" fail_and_exit 1 diff --git a/test/e2e/imds-v2-test b/test/e2e/imds-v2-test index a1e1f90c..3668b4a6 100755 --- a/test/e2e/imds-v2-test +++ b/test/e2e/imds-v2-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ IMDSv2 Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting IMDSv2 Test for Node Termination Handler" @@ -28,15 +28,16 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -50,12 +51,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -69,12 +70,12 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set aemm.imdsv2="true" --set servicePort="$IMDS_PORT" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -88,7 +89,7 @@ TAINT_CHECK_SLEEP=15 DEPLOYED=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" DEPLOYED=1 @@ -105,8 +106,8 @@ fi cordoned=0 test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" 
cordoned=1 fi @@ -126,5 +127,4 @@ else echo "❌ regular-pod-test pod was not evicted" fi -echo "❌ IMDSv2 Test failed $CLUSTER_NAME ❌" fail_and_exit 1 diff --git a/test/e2e/maintenance-event-cancellation-test b/test/e2e/maintenance-event-cancellation-test index 756d8503..eab9095a 100755 --- a/test/e2e/maintenance-event-cancellation-test +++ b/test/e2e/maintenance-event-cancellation-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Test Maintenance Event Cancellation failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Maintenance Event Cancellation Test for Node Termination Handler" @@ -28,17 +28,17 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" --set taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -52,12 +52,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -71,14 +71,14 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{events}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -92,7 +92,7 @@ TAINT_CHECK_SLEEP=15 DEPLOYED=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" DEPLOYED=1 @@ -111,13 +111,13 @@ cordoned=0 tainted=0 evicted=0 test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node --no-headers | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" --no-headers | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi - if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then echo "✅ Verified the worked node was tainted!" 
tainted=1 fi @@ -145,15 +145,15 @@ fi aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set aemm.events.state="canceled" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{events}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -165,12 +165,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && emtp_helm_args+=("${common_helm_args[@]}") @@ -180,8 +180,8 @@ helm "${emtp_helm_args[@]}" set +x uncordoned=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $uncordoned -eq 0 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $uncordoned -eq 0 ]] && kubectl get nodes "${test_node}" --no-headers | grep -v SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was uncordoned!" uncordoned=1 fi @@ -200,4 +200,5 @@ if [[ $uncordoned -eq 0 ]]; then else echo "❌ regular-pod-test pod was not rescheduled" fi + fail_and_exit 1 diff --git a/test/e2e/maintenance-event-dry-run-test b/test/e2e/maintenance-event-dry-run-test index 08e72ee4..b8339fa2 100755 --- a/test/e2e/maintenance-event-dry-run-test +++ b/test/e2e/maintenance-event-dry-run-test @@ -15,7 +15,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Scheduled Maintenance Events Dry-Run Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Maintenance Events Dry-Run Test for Node Termination Handler" @@ -29,17 +29,17 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set dryRun="true" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -53,13 +53,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -73,12 +73,12 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set arguments='{events}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -94,12 +94,12 @@ logs=0 pod_id="$(get_nth_worker_pod)" test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" for i in $(seq 1 $TAINT_CHECK_CYCLES); do - if [[ $logs -eq 0 && ! 
-z $(kubectl logs $pod_id -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then + if [[ $logs -eq 0 && ! -z $(kubectl logs "${pod_id}" -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then echo "✅ Verified the dryrun logs were executed" logs=1 fi - if [[ $logs -eq 1 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then + if [[ $logs -eq 1 ]] && kubectl get nodes "${test_node}" --no-headers | grep -v SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was not cordoned!" echo "✅ Scheduled Maintenance Event Dry Run Test Passed $CLUSTER_NAME! ✅" exit 0 @@ -114,5 +114,4 @@ else echo "❌ Worker node was cordoned" fi -echo "❌ Scheduled Maintenance Event Dry Run Test failed $CLUSTER_NAME ❌" fail_and_exit 1 diff --git a/test/e2e/maintenance-event-reboot-test b/test/e2e/maintenance-event-reboot-test index f635bf08..5df258aa 100755 --- a/test/e2e/maintenance-event-reboot-test +++ b/test/e2e/maintenance-event-reboot-test @@ -10,7 +10,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Scheduled Maintenance Event System Reboot Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Maintenance Event Cancellation Test for Node Termination Handler" @@ -24,17 +24,17 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" --set taintNode="true" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -48,12 +48,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -67,12 +67,12 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set arguments='{events}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -85,7 +85,7 @@ TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" deployed=1 @@ -103,13 +103,13 @@ fi cordoned=0 tainted=0 test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned for maintenance event reboot!" 
cordoned=1 fi - if [[ $cordoned -eq 1 ]] && kubectl get nodes $test_node -o json | grep "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + if [[ $cordoned -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then echo "✅ Verified the worker node was tainted!" tainted=1 fi @@ -153,11 +153,9 @@ kubectl delete deployments "$deployment" anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -165,6 +163,8 @@ anth_helm_args=( --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" --set taintNode="true" + --wait + --force ) [[ ${#common_helm_args[@]} -gt 0 ]] && anth_helm_args+=("${common_helm_args[@]}") @@ -175,14 +175,14 @@ set +x uncordoned=0 untainted=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - NODE_LINE=$(kubectl get nodes $test_node | grep -v 'STATUS') - if [[ $uncordoned -eq 0 && -z $(echo $NODE_LINE | grep SchedulingDisabled) ]] && [[ ! -z $(echo $NODE_LINE | grep Ready) ]]; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + NODE_LINE=$(kubectl get nodes "${test_node}" | grep -v 'STATUS') + if [[ $uncordoned -eq 0 && -z $(echo "${NODE_LINE}" | grep SchedulingDisabled) ]] && [[ ! -z $(echo "${NODE_LINE}" | grep Ready) ]]; then echo "✅ Verified the worker node was uncordoned!" uncordoned=1 fi - if [[ $uncordoned -eq 1 && $untainted -eq 0 ]] && ! kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + if [[ $uncordoned -eq 1 && $untainted -eq 0 ]] && ! kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then echo "✅ Verified the worked node was untainted!" 
untainted=1 fi @@ -203,4 +203,5 @@ elif [[ $untainted -eq 0 ]]; then else echo "❌ regular-pod-test pod was not rescheduled" fi + fail_and_exit 1 diff --git a/test/e2e/maintenance-event-test b/test/e2e/maintenance-event-test index 54bb75fd..7090dc34 100755 --- a/test/e2e/maintenance-event-test +++ b/test/e2e/maintenance-event-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Maintenance Events Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Maintenance Events Test for Node Termination Handler" @@ -28,18 +28,18 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" --set taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -53,12 +53,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -72,14 +72,14 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{events}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && @@ -94,7 +94,7 @@ TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" deployed=1 @@ -112,13 +112,13 @@ fi cordoned=0 tainted=0 test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi - if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then echo "✅ Verified the worked node was tainted!" 
tainted=1 fi @@ -139,4 +139,5 @@ elif [[ $tainted -eq 0 ]]; then else echo "❌ regular-pod-test pod was not evicted" fi + fail_and_exit 1 diff --git a/test/e2e/prometheus-metrics-test b/test/e2e/prometheus-metrics-test index 133a476a..eaaf11b5 100755 --- a/test/e2e/prometheus-metrics-test +++ b/test/e2e/prometheus-metrics-test @@ -25,11 +25,9 @@ retry 5 helm install prometheus-operator "$SCRIPTPATH/../../config/helm/kube-pro anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -38,7 +36,9 @@ anth_helm_args=( --set taintNode="true" --set enablePrometheusServer="true" --set podMonitor.create="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -52,12 +52,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -71,13 +71,13 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -91,7 +91,7 @@ TAINT_CHECK_SLEEP=15 DEPLOYED=0 -for i in `seq 1 10`; do +for i in $(seq 1 10); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" DEPLOYED=1 @@ -106,11 +106,11 @@ fi EXIT_STATUS=1 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if kubectl get nodes "${CLUSTER_NAME}-worker" | grep SchedulingDisabled; then echo "✅ Verified the worker node was cordoned!" - if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/spot-itn"; then + if kubectl get nodes "${CLUSTER_NAME}-worker" -o json | grep -q "aws-node-termination-handler/spot-itn"; then echo "✅ Verified the worked node was tainted!" else echo "❌ Failed tainting node for spot termination event" @@ -145,7 +145,7 @@ echo "✅ Port-forwarded pod $POD_NAME" sleep 10 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do METRICS_RESPONSE=$(curl -L localhost:7000/metrics) echo "✅ Fetched /metrics." 
failed="" diff --git a/test/e2e/rebalance-recommendation-drain-test b/test/e2e/rebalance-recommendation-drain-test index 9ff49350..985e1d1b 100755 --- a/test/e2e/rebalance-recommendation-drain-test +++ b/test/e2e/rebalance-recommendation-drain-test @@ -28,11 +28,9 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -40,7 +38,9 @@ anth_helm_args=( --set enableSpotInterruptionDraining="false" --set enableRebalanceDraining="true" --set taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -54,13 +54,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -74,15 +74,15 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set aemm.mockDelaySec=60 --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && @@ -159,4 +159,5 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do done echo "❌ regular-pod-test was NOT evicted" + fail_and_exit 1 diff --git a/test/e2e/rebalance-recommendation-dry-run-test b/test/e2e/rebalance-recommendation-dry-run-test index 49c8db0b..827ff457 100755 --- a/test/e2e/rebalance-recommendation-dry-run-test +++ b/test/e2e/rebalance-recommendation-dry-run-test @@ -28,11 +28,9 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -41,7 +39,9 @@ anth_helm_args=( --set enableSpotInterruptionDraining="false" --set enableRebalanceDraining="true" --set taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -55,13 +55,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -75,15 +75,15 @@ set +x 
aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set aemm.mockDelaySec=60 --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && @@ -116,12 +116,12 @@ logs=0 pod_id=$(get_nth_worker_pod) test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" for i in $(seq 1 $TAINT_CHECK_CYCLES); do - if [[ $logs -eq 0 && ! -z $(kubectl logs $pod_id -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then + if [[ $logs -eq 0 && ! -z $(kubectl logs "${pod_id}" -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then echo "✅ Verified the dryrun logs were executed" logs=1 fi - if [[ $logs -eq 1 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then + if [[ $logs -eq 1 ]] && kubectl get nodes "${test_node}" --no-headers | grep -v SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was not cordoned!" echo "✅ Rebalance Recommendation Dry Run Test Passed $CLUSTER_NAME! ✅" exit 0 @@ -135,4 +135,5 @@ if [[ $logs -eq 0 ]]; then else echo "❌ Worker node was cordoned" fi + fail_and_exit 1 diff --git a/test/e2e/rebalance-recommendation-sqs-test b/test/e2e/rebalance-recommendation-sqs-test index 0d6d4c87..8c8d4774 100755 --- a/test/e2e/rebalance-recommendation-sqs-test +++ b/test/e2e/rebalance-recommendation-sqs-test @@ -28,12 +28,12 @@ common_helm_args=() localstack_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-localstack" "$SCRIPTPATH/../../config/helm/localstack/" - --wait - --namespace default - --set nodeSelector."$NTH_CONTROL_LABEL" + --set nodeSelector."${NTH_CONTROL_LABEL}" --set defaultRegion="${AWS_REGION}" + --wait ) set -x @@ -60,23 +60,22 @@ echo "🥑 Created SQS Queue ${queue_url}" anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-acth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --namespace kube-system --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists --set awsAccessKeyID=foo --set awsSecretAccessKey=bar --set awsRegion="${AWS_REGION}" --set awsEndpoint="http://localstack.default" --set checkASGTagBeforeDraining=false --set enableSqsTerminationDraining=true - --set enableScheduledEventDraining=false - --set enableSpotInterruptionDraining=false - --set enableRebalanceMonitoring=false --set taintNode="true" - --set nodeSelector."$NTH_CONTROL_LABEL" --set "queueURL=${queue_url}" + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -90,11 +89,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -198,4 +198,4 @@ elif [[ $message_deleted -eq 0 ]]; then fail_and_exit 3 fi -fail_and_exit 1 \ No newline at end of file +fail_and_exit 1 diff --git a/test/e2e/rebalance-recommendation-test b/test/e2e/rebalance-recommendation-test index 
862f96ff..9e324a58 100755 --- a/test/e2e/rebalance-recommendation-test +++ b/test/e2e/rebalance-recommendation-test @@ -28,11 +28,9 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -40,7 +38,9 @@ anth_helm_args=( --set enableSpotInterruptionDraining="true" --set enableRebalanceMonitoring="true" --set taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -54,13 +54,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -74,15 +74,15 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set aemm.mockDelaySec=60 --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && @@ -161,5 +161,4 @@ done echo "❌ regular-pod-test was NOT evicted" -echo "❌ Rebalance Recommendation Test Failed $CLUSTER_NAME ❌" -fail_and_exit 1 \ No newline at end of file +fail_and_exit 1 diff --git a/test/e2e/scheduled-change-event-sqs-test b/test/e2e/scheduled-change-event-sqs-test new file mode 100755 index 00000000..e63597a9 --- /dev/null +++ b/test/e2e/scheduled-change-event-sqs-test @@ -0,0 +1,214 @@ +#!/bin/bash +set -euo pipefail + +# Available env vars: +# $TMP_DIR +# $CLUSTER_NAME +# $KUBECONFIG +# $NODE_TERMINATION_HANDLER_DOCKER_REPO +# $NODE_TERMINATION_HANDLER_DOCKER_TAG +# $WEBHOOK_DOCKER_REPO +# $WEBHOOK_DOCKER_TAG +# $AEMM_URL +# $AEMM_VERSION + + +function fail_and_exit { + echo "❌ AWS Scheduled Change SQS Test failed $CLUSTER_NAME ❌" + exit "${1:-1}" +} + +echo "Starting AWS Scheduled Change SQS Test for Node Termination Handler" +START_TIME=$(date -u +"%Y-%m-%dT%TZ") + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +common_helm_args=() + +localstack_helm_args=( + upgrade + --install + "$CLUSTER_NAME-localstack" + "$SCRIPTPATH/../../config/helm/localstack/" + --wait + --namespace default + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set defaultRegion="${AWS_REGION}" +) + +set -x +helm "${localstack_helm_args[@]}" +set +x + +sleep 10 + +RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test}]'" +localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ + -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ + | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') +echo "🥑 Using localstack pod 
${localstack_pod}" +run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") +private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') +instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') +echo "🥑 Started mock EC2 instance (${instance_id}) w/ private DNS name: ${private_dns_name}" + +CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" +queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) + +echo "🥑 Created SQS Queue ${queue_url}" + +anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=foo + --set awsSecretAccessKey=bar + --set awsRegion="${AWS_REGION}" + --set awsEndpoint="http://localstack.default" + --set checkASGTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set taintNode="true" + --set "queueURL=${queue_url}" + --wait +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-emtp" + "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" + --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" + --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait +) +[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x + +TAINT_CHECK_CYCLES=15 +TAINT_CHECK_SLEEP=15 + +DEPLOYED=0 + +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "✅ Verified regular-pod-test pod was scheduled and started!" + DEPLOYED=1 + break + fi + echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $DEPLOYED -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 +fi + +AWS_SCHEDULED_CHANGE_EVENT=$(cat < /dev/null; then + echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then + echo "✅ Verified the worker node was tainted!" + tainted=1 + fi + + if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" 
+ evicted=1 + fi + + if [[ ${evicted} -eq 1 && $(kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}" | jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)' ) -eq 0 ]]; then + kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}" + echo "✅ Verified the message was deleted from the queue after processing!" + message_deleted=1 + echo "✅ AWS Scheduled Change SQS Test Passed $CLUSTER_NAME! ✅" + exit 0 + fi + + echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" + fail_and_exit 3 +elif [[ $tainted -eq 0 ]]; then + echo "❌ Worker node was not tainted" + fail_and_exit 3 +elif [[ $evicted -eq 0 ]]; then + echo "❌ regular-pod-test was NOT evicted" + fail_and_exit 3 +elif [[ $message_deleted -eq 0 ]]; then + echo "❌ message was not removed from the queue after processing" + fail_and_exit 3 +fi + +fail_and_exit 1 diff --git a/test/e2e/spot-interruption-dry-run-test b/test/e2e/spot-interruption-dry-run-test index 1e170fca..051a0768 100755 --- a/test/e2e/spot-interruption-dry-run-test +++ b/test/e2e/spot-interruption-dry-run-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Spot Interruption Dry Run test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Maintenance Events Dry-Run Test for Node Termination Handler" @@ -28,17 +28,17 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set dryRun="true" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -52,13 +52,12 @@ set +x emtp_helm_args=( upgrade --install - + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -72,12 +71,12 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -93,12 +92,12 @@ logs=0 pod_id=$(get_nth_worker_pod) test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" for i in $(seq 1 $TAINT_CHECK_CYCLES); do - if [[ $logs -eq 0 && ! -z $(kubectl logs $pod_id -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then + if [[ $logs -eq 0 && ! 
-z $(kubectl logs "${pod_id}" -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then echo "✅ Verified the dryrun logs were executed" logs=1 fi - if [[ $logs -eq 1 ]] && kubectl get nodes $test_node --no-headers | grep -v SchedulingDisabled >/dev/null; then + if [[ $logs -eq 1 ]] && kubectl get nodes "${test_node}" --no-headers | grep -v SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was not cordoned!" echo "✅ Spot Interruption Dry Run Test Passed $CLUSTER_NAME! ✅" exit 0 @@ -112,4 +111,5 @@ if [[ $logs -eq 0 ]]; then else echo "❌ Worker node was cordoned" fi + fail_and_exit 1 diff --git a/test/e2e/spot-interruption-sqs-test b/test/e2e/spot-interruption-sqs-test index e473b08b..32498dd5 100755 --- a/test/e2e/spot-interruption-sqs-test +++ b/test/e2e/spot-interruption-sqs-test @@ -32,7 +32,7 @@ localstack_helm_args=( "$SCRIPTPATH/../../config/helm/localstack/" --wait --namespace default - --set nodeSelector."$NTH_CONTROL_LABEL" + --set nodeSelector."${NTH_CONTROL_LABEL}" --set defaultRegion="${AWS_REGION}" ) @@ -60,21 +60,21 @@ echo "🥑 Created SQS Queue ${queue_url}" anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-acth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --namespace kube-system --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists --set awsAccessKeyID=foo --set awsSecretAccessKey=bar --set awsRegion="${AWS_REGION}" --set awsEndpoint="http://localstack.default" --set checkASGTagBeforeDraining=false --set enableSqsTerminationDraining=true - --set enableScheduledEventDraining=false - --set enableSpotInterruptionDraining=false - --set nodeSelector."$NTH_CONTROL_LABEL" --set "queueURL=${queue_url}" + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -88,11 +88,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -181,5 +182,4 @@ else echo "❌ regular-pod-test was not evicted" fi -echo "❌ Spot Interruption SQS Test Failed $CLUSTER_NAME ❌" fail_and_exit 1 diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test index 1998aee5..36781d21 100755 --- a/test/e2e/spot-interruption-test +++ b/test/e2e/spot-interruption-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Spot Interruption test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Spot Interruption Test for Node Termination Handler" @@ -28,18 +28,18 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableScheduledEventDraining="false" --set enableSpotInterruptionDraining="true" --set 
taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -53,13 +53,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -73,14 +73,14 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -93,7 +93,7 @@ TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" deployed=1 @@ -111,13 +111,13 @@ fi cordoned=0 tainted=0 test_node=${TEST_NODE:-$CLUSTER_NAME-worker} -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi - if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then echo "✅ Verified the worked node was tainted!" 
tainted=1 fi @@ -138,4 +138,5 @@ elif [[ $tainted -eq 0 ]]; then else echo "❌ regular-pod-test pod was not evicted" fi + fail_and_exit 1 diff --git a/test/e2e/spot-interruption-test-events-on b/test/e2e/spot-interruption-test-events-on index ac1b0334..d043be05 100755 --- a/test/e2e/spot-interruption-test-events-on +++ b/test/e2e/spot-interruption-test-events-on @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Spot Interruption With Events test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Spot Interruption With Events Test for Node Termination Handler" @@ -28,20 +28,20 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" --set enableScheduledEventDraining="false" --set enableSpotInterruptionDraining="true" --set taintNode="true" - --set tolerations="" + --set daemonsetTolerations="" --set emitKubernetesEvents="true" --set kubernetesEventsExtraAnnotations="spot.itn.events/test=extra-annotation" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -55,13 +55,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -75,14 +75,14 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -95,7 +95,7 @@ TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" deployed=1 @@ -114,13 +114,13 @@ cordoned=0 tainted=0 evicted=0 test_node=${TEST_NODE:-$CLUSTER_NAME-worker} -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi - if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then echo "✅ Verified the worked node was tainted!" 
tainted=1 fi @@ -146,25 +146,25 @@ elif [[ $evicted -eq 0 ]]; then fi echo "🥑 Getting Kubernetes events..." -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do eventnotfound="" annotationnotfound="" extraannotationnotfound="" events=$(kubectl get events --field-selector source=aws-node-termination-handler -o json) for reason in SpotInterruption PreDrain CordonAndDrain; do - event=$(echo $events | jq --arg REASON "$reason" '[.items[] | select(.reason==$REASON)][0]') + event=$(echo "${events}" | jq --arg REASON "$reason" '[.items[] | select(.reason==$REASON)][0]') if [[ $event == "null" ]]; then eventnotfound=$reason break fi for ant in account-id availability-zone instance-id instance-life-cycle instance-type local-hostname local-ipv4 public-hostname public-ipv4 region; do - if [[ "$(echo $event | jq -r --arg ANT "$ant" '.metadata.annotations[$ANT]')" == "null" ]]; then + if [[ "$(echo "${event}" | jq -r --arg ANT "$ant" '.metadata.annotations[$ANT]')" == "null" ]]; then eventnotfound=$reason annotationnotfound=$ant break 2 fi done - if [[ "$(echo $event | jq -r '.metadata.annotations["spot.itn.events/test"]')" != "extra-annotation" ]]; then + if [[ "$(echo "${event}" | jq -r '.metadata.annotations["spot.itn.events/test"]')" != "extra-annotation" ]]; then extraannotationnotfound=$reason break fi @@ -186,4 +186,5 @@ if [ ! -z $eventnotfound ]; then fail_and_exit 1 fi echo "❌ Extra annotation was not found on event with reason $extraannotationnotfound" + fail_and_exit 1 diff --git a/test/e2e/spot-interruption-test-host-networking-off b/test/e2e/spot-interruption-test-host-networking-off index aa041dac..eb3fe340 100755 --- a/test/e2e/spot-interruption-test-host-networking-off +++ b/test/e2e/spot-interruption-test-host-networking-off @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Spot Interruption w/o Host Networking test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Spot Interruption w/o Host Networking Test for Node Termination Handler" @@ -28,11 +28,9 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -40,7 +38,9 @@ anth_helm_args=( --set enableSpotInterruptionDraining="true" --set taintNode="true" --set useHostNetwork="false" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -54,13 +54,13 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --force - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait + --force ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -74,14 +74,14 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set
'tolerations[0].operator=Exists' --set arguments='{spot}' + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -94,7 +94,7 @@ TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" deployed=1 @@ -112,13 +112,13 @@ fi cordoned=0 tainted=0 test_node=${TEST_NODE:-$CLUSTER_NAME-worker} -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi - if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then + if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then echo "✅ Verified the worked node was tainted!" tainted=1 fi @@ -139,4 +139,5 @@ elif [[ $tainted -eq 0 ]]; then else echo "❌ regular-pod-test pod was not evicted" fi + fail_and_exit 1 diff --git a/test/e2e/webhook-http-proxy-test b/test/e2e/webhook-http-proxy-test index 2f545713..7f6bd02b 100755 --- a/test/e2e/webhook-http-proxy-test +++ b/test/e2e/webhook-http-proxy-test @@ -20,7 +20,7 @@ SQUID_URL="tcp://squid.default.svc.cluster.local:$SQUID_PORT" function fail_and_exit { echo "❌ Webhook HTTP Proxy Test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } function get_squid_worker_pod() { @@ -30,13 +30,13 @@ function get_squid_worker_pod() { function start_squid() { kubectl delete configmap squid-config || : - kubectl create configmap squid-config --from-file="$SCRIPTPATH/../assets/squid.conf" + kubectl create configmap squid-config --from-file="${SCRIPTPATH}/../assets/squid.conf" old_squid_pods="" - for i in `seq 1 10`; do + for i in $(seq 1 10); do echo "Checking if squid http-proxy has been terminated from previous run..." old_squid_pods="$(get_squid_worker_pod)" - if [[ -n "${old_squid_pods}" ]]; then + if [[ -n "${old_squid_pods}" ]]; then echo "Still waiting on squid to terminate from last run... Check $i/10" sleep 10 else @@ -44,20 +44,19 @@ function start_squid() { fi done - helm upgrade --install $CLUSTER_NAME-squid $SCRIPTPATH/../../config/helm/squid/ \ - --force \ - --wait \ - --namespace default \ + helm upgrade --install --namespace default "${CLUSTER_NAME}-squid" "${SCRIPTPATH}/../../config/helm/squid/" \ --set squid.configMap="squid-config" \ --set squid.image.repository="squid" \ - --set squid.image.tag="customtest" + --set squid.image.tag="customtest" \ + --force \ + --wait \ ## Squid can take a while to start, try to get a squid worker pod, if not, hope for the best when assertions are checked squid_worker_pods="" - for i in `seq 1 10`; do + for i in $(seq 1 10); do echo "Checking if squid http-proxy has started..." squid_worker_pods="$(get_squid_worker_pod)" - if [[ -z "${squid_worker_pods}" ]]; then + if [[ -z "${squid_worker_pods}" ]]; then echo "Still waiting on squid... 
Check $i/10" sleep 10 else @@ -83,11 +82,11 @@ common_helm_args=() aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -99,12 +98,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -118,11 +117,9 @@ set +x anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --force - --namespace kube-system - --wait --set instanceMetadataURL="http://$AEMM_URL:$IMDS_PORT" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -131,6 +128,8 @@ anth_helm_args=( --set webhookURL="$WEBHOOK_URL" --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - Node: \{\{ \.NodeName \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" --set webhookProxy="$SQUID_URL" + --force + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -145,7 +144,7 @@ TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" deployed=1 @@ -163,8 +162,8 @@ fi cordoned=0 evicted=0 sent=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${CLUSTER_NAME}-worker" | grep SchedulingDisabled; then echo "✅ Verified the worker node was cordoned!" 
cordoned=1 fi @@ -175,13 +174,13 @@ for i in `seq 1 $TAINT_CHECK_CYCLES`; do pod_id=$(get_nth_worker_pod) fi - if [[ $evicted -eq 1 && $sent -eq 0 ]] && kubectl logs $pod_id -n kube-system | grep 'Webhook Success' >/dev/null; then + if [[ $evicted -eq 1 && $sent -eq 0 ]] && kubectl logs "${pod_id}" -n kube-system | grep 'Webhook Success' >/dev/null; then echo "✅ Verified that webhook successfully sent" sent=1 fi webhook_hostname=$(echo "${WEBHOOK_URL}" | sed -e 's@^[^/]*//@@' -e 's@/.*$@@') - if [[ $sent -eq 1 ]] && kubectl exec -i "$(echo $squid_worker_pods | cut -d' ' -f1)" -- cat /var/log/squid/access.log | grep "${webhook_hostname}" >/dev/null; then + if [[ $sent -eq 1 ]] && kubectl exec -i "$(echo "${squid_worker_pods}" | cut -d' ' -f1)" -- cat /var/log/squid/access.log | grep "${webhook_hostname}" >/dev/null; then echo "✅ Verified the webhook POST used the http proxy" exit 0 fi @@ -200,7 +199,7 @@ else fi echo "===================================== SQUID LOGS ====================================================" -kubectl exec -i "$(echo $squid_worker_pods | cut -d' ' -f1)" -- cat /var/log/squid/access.log +kubectl exec -i "$(echo "${squid_worker_pods}" | cut -d' ' -f1)" -- cat /var/log/squid/access.log echo "===================================== END SQUID LOGS ====================================================" fail_and_exit 1 diff --git a/test/e2e/webhook-secret-test b/test/e2e/webhook-secret-test index 9c62df5c..cfa269e7 100755 --- a/test/e2e/webhook-secret-test +++ b/test/e2e/webhook-secret-test @@ -21,7 +21,7 @@ function cleanup { kubectl delete secret -n kube-system "${WEBHOOK_NAME}" || : } -kubectl create secret -n kube-system generic "${WEBHOOK_NAME}" --from-literal=$WEBHOOKURL_LITERAL +kubectl create secret -n kube-system generic "${WEBHOOK_NAME}" --from-literal="${WEBHOOKURL_LITERAL}" trap "cleanup" EXIT INT TERM ERR @@ -32,11 +32,11 @@ common_helm_args=() aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -48,12 +48,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -67,11 +67,9 @@ set +x anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --force - --namespace kube-system - --wait --set instanceMetadataURL="http://$AEMM_URL:$IMDS_PORT" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -79,6 +77,8 @@ anth_helm_args=( --set enableScheduledEventDraining="true" --set webhookURLSecretName=webhooksecret \ --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - Node: \{\{ \.NodeName \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" + --force + --wait ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -93,7 +93,7 @@ 
TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 DEPLOYED=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" DEPLOYED=1 @@ -107,11 +107,11 @@ if [[ $DEPLOYED -eq 0 ]]; then exit 2 fi -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if kubectl get nodes "$CLUSTER_NAME-worker" | grep SchedulingDisabled; then echo "✅ Verified the worker node was cordoned!" NTH_POD_NAME=$(get_nth_worker_pod) - if kubectl logs $NTH_POD_NAME -n kube-system | grep 'Webhook Success'; then + if kubectl logs "${NTH_POD_NAME}" -n kube-system | grep 'Webhook Success'; then echo "✅ Verified the webhook message was sent!" echo "✅ Webhook URL as a Secret Test Passed $CLUSTER_NAME! ✅" exit 0 diff --git a/test/e2e/webhook-test b/test/e2e/webhook-test index 911d13bf..0328a432 100755 --- a/test/e2e/webhook-test +++ b/test/e2e/webhook-test @@ -14,7 +14,7 @@ set -euo pipefail function fail_and_exit { echo "❌ Webhook test failed $CLUSTER_NAME ❌" - exit ${1:-1} + exit "${1:-1}" } echo "Starting Webhook Test for Node Termination Handler" @@ -28,11 +28,9 @@ common_helm_args=() anth_helm_args=( upgrade --install + --namespace kube-system "$CLUSTER_NAME-anth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" - --wait - --force - --namespace kube-system --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" @@ -40,7 +38,9 @@ anth_helm_args=( --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - Node: \{\{ \.NodeName \}\} - InstanceType: \{\{ \.InstanceType \}\} - AvailabilityZone: \{\{ \.AvailabilityZone \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" --set enableSpotInterruptionDraining="true" --set enableScheduledEventDraining="true" - --set tolerations="" + --set daemonsetTolerations="" + --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -54,12 +54,12 @@ set +x emtp_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-emtp" "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" - --wait - --namespace default --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait ) [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") @@ -73,11 +73,11 @@ set +x aemm_helm_args=( upgrade --install + --namespace default "$CLUSTER_NAME-aemm" "$AEMM_DL_URL" - --wait - --namespace default --set servicePort="$IMDS_PORT" + --wait ) [[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}") @@ -90,7 +90,7 @@ TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 deployed=0 -for i in `seq 1 $TAINT_CHECK_CYCLES`; do +for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" 
deployed=1 @@ -108,13 +108,13 @@ fi cordoned=0 nth_pod_name=$(get_nth_worker_pod) test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" -for i in `seq 1 $TAINT_CHECK_CYCLES`; do - if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 fi - if [[ $cordoned -eq 1 ]] && kubectl logs $nth_pod_name -n kube-system | grep 'Webhook Success' >/dev/null; then + if [[ $cordoned -eq 1 ]] && kubectl logs "${nth_pod_name}" -n kube-system | grep 'Webhook Success' >/dev/null; then echo "✅ Verified the webhook message was sent!" echo "✅ Webhook Test Passed $CLUSTER_NAME! ✅" exit 0 @@ -128,4 +128,5 @@ if [[ $cordoned -eq 0 ]]; then else echo "❌ Webhook message was not sent" fi + fail_and_exit 1 diff --git a/test/eks-cluster-test/provision-cluster b/test/eks-cluster-test/provision-cluster index 18ce5749..82ca1aad 100755 --- a/test/eks-cluster-test/provision-cluster +++ b/test/eks-cluster-test/provision-cluster @@ -27,12 +27,12 @@ fi ## Build Docker images echo "🥑 Building the node-termination-handler docker image" -docker build $DOCKER_ARGS -t $DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG "$SCRIPTPATH/../../." +docker buildx build --load $DOCKER_ARGS -t $DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG "$SCRIPTPATH/../../." NODE_TERMINATION_HANDLER_DOCKER_IMG="$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG" echo "👍 Built the node-termination-handler docker image" echo "🥑 Building the webhook-test-proxy docker image" -docker build $DOCKER_ARGS -t $DEFAULT_WEBHOOK_DOCKER_IMG "$SCRIPTPATH/../webhook-test-proxy/." +docker buildx build --load $DOCKER_ARGS -t $DEFAULT_WEBHOOK_DOCKER_IMG "$SCRIPTPATH/../webhook-test-proxy/." WEBHOOK_DOCKER_IMG="$DEFAULT_WEBHOOK_DOCKER_IMG" echo "👍 Built the webhook-test-proxy docker image" diff --git a/test/helm/helm-lint b/test/helm/helm-lint index 8f1cb528..84f7c523 100755 --- a/test/helm/helm-lint +++ b/test/helm/helm-lint @@ -4,51 +4,49 @@ set -euo pipefail SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" TMP_DIR="$SCRIPTPATH/../../build" PLATFORM=$(uname | tr '[:upper:]' '[:lower:]') -HELM3_VERSION="3.3.1" -HELM2_VERSION="2.16.10" +HELM_VERSION="3.7.1" HELM_DIR="${SCRIPTPATH}/../../config/helm" mkdir -p $TMP_DIR if [ ! -x "$TMP_DIR/helm" ]; then - echo "🥑 Downloading the \"helm3\" binary" - curl -L https://get.helm.sh/helm-v$HELM3_VERSION-$PLATFORM-amd64.tar.gz | tar zxf - -C $TMP_DIR + echo "🥑 Downloading the \"helm\" binary" + curl -L https://get.helm.sh/helm-v$HELM_VERSION-$PLATFORM-amd64.tar.gz | tar zxf - -C $TMP_DIR mv $TMP_DIR/$PLATFORM-amd64/helm $TMP_DIR/. chmod +x $TMP_DIR/helm echo "👍 Downloaded the \"helm\" binary" fi - -if [ ! 
-x "$TMP_DIR/helm2" ]; then - echo "🥑 Downloading the \"helm2\" binary" - curl -L https://get.helm.sh/helm-v$HELM2_VERSION-$PLATFORM-amd64.tar.gz | tar zxf - -C $TMP_DIR - mv $TMP_DIR/$PLATFORM-amd64/helm $TMP_DIR/helm2 - chmod +x $TMP_DIR/helm2 - echo "👍 Downloaded the \"helm2\" binary" -fi export PATH=$TMP_DIR:$PATH echo "==============================================================================" -echo " Linting Helm Chart w/ Helm v3" +echo " Linting Helm Chart" echo "==============================================================================" -helm lint $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ +helm lint "${HELM_DIR}/aws-node-termination-handler/" + +echo "✅ Helm Linting has successfully completed!" + +echo "==============================================================================" +echo " Generate Template from Helm Chart with default values" +echo "==============================================================================" + +helm template nth "${HELM_DIR}/aws-node-termination-handler" --namespace=kube-system --debug > /dev/null echo "==============================================================================" -echo " Linting Helm Chart w/ Helm v2" +echo " Generate Template from Helm Chart with queue-proccessor values" echo "==============================================================================" -helm2 lint $SCRIPTPATH/../../config/helm/aws-node-termination-handler/ -echo "✅ Helm Linting for v2 and v3 have successfully completed!" +helm template nth "${HELM_DIR}/aws-node-termination-handler" --namespace=kube-system --debug -f "${HELM_DIR}/aws-node-termination-handler/example-values-queue.yaml" > /dev/null echo "==============================================================================" -echo " Generate Template w/ Helm v3" +echo " Generate Template from Helm Chart with Linux IMDS values" echo "==============================================================================" -helm template nth "${HELM_DIR}/aws-node-termination-handler" --debug --namespace=kube-system -f "${HELM_DIR}/aws-node-termination-handler/test.yaml" > /dev/null +helm template nth "${HELM_DIR}/aws-node-termination-handler" --namespace=kube-system --debug -f "${HELM_DIR}/aws-node-termination-handler/example-values-imds-linux.yaml" > /dev/null echo "==============================================================================" -echo " Generate Template w/ Helm v2" +echo " Generate Template from Helm Chart with Windows IMDS values" echo "==============================================================================" -helm2 template --name nth "${HELM_DIR}/aws-node-termination-handler" --debug --namespace=kube-system -f "${HELM_DIR}/aws-node-termination-handler/test.yaml" > /dev/null +helm template nth "${HELM_DIR}/aws-node-termination-handler" --namespace=kube-system --debug -f "${HELM_DIR}/aws-node-termination-handler/example-values-imds-windows.yaml" > /dev/null -echo "✅ Helm template generation for v2 and v3 have successfully completed!" +echo "✅ Helm template generation has successfully completed!" 
diff --git a/test/k8s-local-cluster-test/provision-cluster b/test/k8s-local-cluster-test/provision-cluster index 119bf3de..276c9bdb 100755 --- a/test/k8s-local-cluster-test/provision-cluster +++ b/test/k8s-local-cluster-test/provision-cluster @@ -24,7 +24,7 @@ K8_1_17="kindest/node:v1.17.17@sha256:66f1d0d91a88b8a001811e2f1054af60eef3b669a9 K8_VERSION="$K8_1_20" KUBECTL_VERSION=$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt) KIND_VERSION="0.11.1" -HELM_VERSION="3.6.0" +HELM_VERSION="3.7.1" echoerr() { echo "$@" 1>&2; } diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index ca75dd1a..cd975e40 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -99,8 +99,8 @@ function remove_labels { echo "Removing labels from NTH cluster nodes" labels_to_remove=() - while IFS='' read -r line; do - labels_to_remove+=("$line"); + while IFS='' read -r line; do + labels_to_remove+=("$line"); done < <(kubectl get nodes -o json | jq '.items[].metadata.labels' | grep 'aws-node-termination-handler' | tr -d '[:blank:]' | tr -d '\"' | cut -d':' -f1) if [[ "${#labels_to_remove[@]}" -ne 0 ]]; then @@ -196,7 +196,7 @@ CLUSTER_NAME=$(cat "$TMP_DIR/clustername") if [ -z "$NODE_TERMINATION_HANDLER_DOCKER_IMG" ]; then echo "🥑 Building the node-termination-handler docker image" - docker build $DOCKER_ARGS -t "$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG" "$SCRIPTPATH/../../." + docker buildx build --load $DOCKER_ARGS -t "$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG" "$SCRIPTPATH/../../." NODE_TERMINATION_HANDLER_DOCKER_IMG="$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG" echo "👍 Built the node-termination-handler docker image" else @@ -208,7 +208,7 @@ NODE_TERMINATION_HANDLER_DOCKER_TAG=$(echo "$NODE_TERMINATION_HANDLER_DOCKER_IMG if [ -z "$WEBHOOK_DOCKER_IMG" ]; then echo "🥑 Building the webhook-test-proxy docker image" - docker build $DOCKER_ARGS -t "$DEFAULT_WEBHOOK_DOCKER_IMG" "$SCRIPTPATH/../webhook-test-proxy/." + docker buildx build --load $DOCKER_ARGS -t "$DEFAULT_WEBHOOK_DOCKER_IMG" "$SCRIPTPATH/../webhook-test-proxy/." 
WEBHOOK_DOCKER_IMG="$DEFAULT_WEBHOOK_DOCKER_IMG" echo "👍 Built the webhook-test-proxy docker image" else @@ -259,7 +259,7 @@ export NTH_WORKER_LABEL="kubernetes\.io/hostname=ip-${WORKER_IP//\./-}.ec2.inter ### ## Need to override hostname label for CTH localstack tests -kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\')" --overwrite +kubectl label node "${CLUSTER_NAME}-worker" "$(echo $NTH_WORKER_LABEL | tr -d '\\')" --overwrite ## Mark worker2 only for Critical Add-Ons like dns kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite diff --git a/test/license-test/run-license-test.sh b/test/license-test/run-license-test.sh index d0d56675..7bf14335 100755 --- a/test/license-test/run-license-test.sh +++ b/test/license-test/run-license-test.sh @@ -10,6 +10,6 @@ LICENSE_TEST_TAG="nth-license-test" LICENSE_REPORT_FILE="$BUILD_PATH/license-report" SUPPORTED_PLATFORMS_LINUX="linux/amd64" make -s -f $SCRIPTPATH/../../Makefile build-binaries -docker build --build-arg=GOPROXY=direct -t $LICENSE_TEST_TAG $SCRIPTPATH/ +docker buildx build --load --build-arg=GOPROXY=direct -t $LICENSE_TEST_TAG $SCRIPTPATH/ docker run -i -e GITHUB_TOKEN --rm -v $SCRIPTPATH/:/test -v $BUILD_BIN/:/nth-bin $LICENSE_TEST_TAG golicense /test/license-config.hcl /nth-bin/$BINARY_NAME | tee $LICENSE_REPORT_FILE $SCRIPTPATH/check-licenses.sh $LICENSE_REPORT_FILE diff --git a/test/readme-test/run-readme-spellcheck b/test/readme-test/run-readme-spellcheck index 48ba1043..e915f293 100755 --- a/test/readme-test/run-readme-spellcheck +++ b/test/readme-test/run-readme-spellcheck @@ -9,6 +9,6 @@ function exit_and_fail() { } trap exit_and_fail INT ERR TERM -docker build -t misspell -f $SCRIPTPATH/spellcheck-Dockerfile $SCRIPTPATH/ +docker buildx build --load -t misspell -f $SCRIPTPATH/spellcheck-Dockerfile $SCRIPTPATH/ docker run -i --rm -v $SCRIPTPATH/../../:/app misspell /bin/bash -c 'find /app/ -type f -name "*.md" -not -path "build" | grep -v "/build/" | xargs misspell -error -debug' echo "✅ Markdown file spell check passed!" \ No newline at end of file
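
All of the docker build call sites above move to BuildKit via buildx with the result loaded back into the local image store. A minimal sketch of the shared pattern, for illustration only: the tag and build context are placeholders, and it assumes the Docker buildx plugin is installed.

# DOCKER_ARGS carries optional extra build flags, as in the scripts above.
# --load exports the built image into the local docker image store, so the
# docker run / kind image-load steps that follow behave the same as with plain docker build.
docker buildx build --load ${DOCKER_ARGS:-} -t "nth-example:test" .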