Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/node-termination-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
}
}

interruptionEventStore.MarkAllAsDrained(nodeName)
interruptionEventStore.MarkAllAsProcessed(nodeName)
if nthConfig.WebhookURL != "" {
webhook.Post(nodeMetadata, drainEvent, nthConfig)
}
Expand Down
12 changes: 6 additions & 6 deletions docs/aemm_interruption_testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ If you run the example above you might notice that the logs are heavily populate
```
2020/09/15 21:13:41 Sending interruption event to the interruption channel
2020/09/15 21:13:41 Got interruption event from channel {InstanceID:i-1234567890abcdef0 InstanceType:m4.xlarge PublicHostname:ec2-192-0-2-54.compute-1.amazonaws.com PublicIP:192.0.2.54 LocalHostname:ip-172-16-34-43.ec2.internal LocalIP:172.16.34.43 AvailabilityZone:us-east-1a} {EventID:spot-itn-47ddfb5e39791606bec3e91fea4cdfa86f86a60ddaf014c8b4af8e008f134b19 Kind:SPOT_ITN Description:Spot ITN received. Instance will be interrupted at 2020-09-15T21:15:41Z
State: NodeName:ip-192-168-123-456.us-east-1.compute.internal StartTime:2020-09-15 21:15:41 +0000 UTC EndTime:0001-01-01 00:00:00 +0000 UTC Drained:false PreDrainTask:0x113c8a0 PostDrainTask:<nil>}
State: NodeName:ip-192-168-123-456.us-east-1.compute.internal StartTime:2020-09-15 21:15:41 +0000 UTC EndTime:0001-01-01 00:00:00 +0000 UTC NodeProcessed:false PreDrainTask:0x113c8a0 PostDrainTask:<nil>}
WARNING: ignoring DaemonSet-managed Pods: default/amazon-ec2-metadata-mock-pszj2, kube-system/aws-node-bl2bj, kube-system/aws-node-termination-handler-2pvjr, kube-system/kube-proxy-fct9f
evicting pod "coredns-67bfd975c5-rgkh7"
evicting pod "coredns-67bfd975c5-6g88n"
2020/09/15 21:13:42 Node "ip-192-168-123-456.us-east-1.compute.internal" successfully cordoned and drained.
2020/09/15 21:13:43 Sending interruption event to the interruption channel
2020/09/15 21:13:43 Got interruption event from channel {InstanceID:i-1234567890abcdef0 InstanceType:m4.xlarge PublicHostname:ec2-192-0-2-54.compute-1.amazonaws.com PublicIP:192.0.2.54 LocalHostname:ip-172-16-34-43.ec2.internal LocalIP:172.16.34.43 AvailabilityZone:us-east-1a} {EventID:spot-itn-97be476b6246aba6401ba36e54437719bfdf987773e9c83fe30336eb7fea9704 Kind:SPOT_ITN Description:Spot ITN received. Instance will be interrupted at 2020-09-15T21:15:43Z
State: NodeName:ip-192-168-123-456.us-east-1.compute.internal StartTime:2020-09-15 21:15:43 +0000 UTC EndTime:0001-01-01 00:00:00 +0000 UTC Drained:false PreDrainTask:0x113c8a0 PostDrainTask:<nil>}
State: NodeName:ip-192-168-123-456.us-east-1.compute.internal StartTime:2020-09-15 21:15:43 +0000 UTC EndTime:0001-01-01 00:00:00 +0000 UTC NodeProcessed:false PreDrainTask:0x113c8a0 PostDrainTask:<nil>}
WARNING: ignoring DaemonSet-managed Pods: default/amazon-ec2-metadata-mock-pszj2, kube-system/aws-node-bl2bj, kube-system/aws-node-termination-handler-2pvjr, kube-system/kube-proxy-fct9f
2020/09/15 21:13:44 Node "ip-192-168-123-456.us-east-1.compute.internal" successfully cordoned and drained.
2020/09/15 21:13:45 Sending interruption event to the interruption channel
Expand All @@ -54,17 +54,17 @@ WARNING: ignoring DaemonSet-managed Pods: default/amazon-ec2-metadata-mock-pszj2

This isn't a mistake: by default, AEMM will respond to any request for metadata with a spot interruption occurring 2 minutes
later than the request time.\* AWS Node Termination Handler polls for events every 2 seconds by default, so the effect is
that new interruption events are found and processed every 2 seconds.
that new interruption events are found and processed every 2 seconds.

In reality there will only be a single interruption event, and you can mock this by setting the `spot.time` parameter of
AEMM when installing it.
AEMM when installing it.
```
helm install amazon-ec2-metadata-mock amazon-ec2-metadata-mock-1.6.0.tgz \
--set aemm.spot.time="2020-09-09T22:40:47Z" \
--namespace default
```

Now when you check the logs you should only see a single event get processed.
Now when you check the logs you should only see a single event get processed.

For more ways of configuring AEMM check out the [Helm configuration page](https://github.com/aws/amazon-ec2-metadata-mock/tree/main/helm/amazon-ec2-metadata-mock).

Expand All @@ -82,7 +82,7 @@ for the local tests that use a kind cluster, and [here](https://github.com/aws/a
for the eks-cluster e2e tests.

Check out the [ReadMe](https://github.com/aws/aws-node-termination-handler/tree/main/test) in our test folder for more
info on the e2e tests.
info on the e2e tests.

---

Expand Down
10 changes: 5 additions & 5 deletions pkg/interruptioneventstore/interruption-event-store.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func (s *Store) ShouldDrainNode() bool {

func (s *Store) shouldEventDrain(interruptionEvent *monitor.InterruptionEvent) bool {
_, ignored := s.ignoredEvents[interruptionEvent.EventID]
if !ignored && !interruptionEvent.Drained && s.TimeUntilDrain(interruptionEvent) <= 0 {
if !ignored && !interruptionEvent.NodeProcessed && s.TimeUntilDrain(interruptionEvent) <= 0 {
return true
}
return false
Expand All @@ -108,19 +108,19 @@ func (s *Store) TimeUntilDrain(interruptionEvent *monitor.InterruptionEvent) tim
return drainTime.Sub(time.Now())
}

// MarkAllAsDrained should be called after the node has been drained to prevent further unnecessary drain calls to the k8s api
func (s *Store) MarkAllAsDrained(nodeName string) {
// MarkAllAsProcessed should be called after the node has been drained or cordoned to prevent further unnecessary drain or cordon calls to the k8s API
func (s *Store) MarkAllAsProcessed(nodeName string) {
s.Lock()
defer s.Unlock()
for _, interruptionEvent := range s.interruptionEventStore {
if interruptionEvent.NodeName == nodeName {
interruptionEvent.Drained = true
interruptionEvent.NodeProcessed = true
}
}
}

// IgnoreEvent will store an event ID so that monitor loops cannot write to the store with the same event ID
// Drain actions are ignored on the passed in event ID by setting the Drained flag to true
// Drain actions are ignored on the passed in event ID by setting the NodeProcessed flag to true
func (s *Store) IgnoreEvent(eventID string) {
if eventID == "" {
return
Expand Down
22 changes: 11 additions & 11 deletions pkg/interruptioneventstore/interruption-event-store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,26 +99,26 @@ func TestShouldDrainNode(t *testing.T) {
h.Equals(t, true, store.ShouldDrainNode())
}

func TestMarkAllAsDrained(t *testing.T) {
func TestMarkAllAsProcessed(t *testing.T) {
store := interruptioneventstore.New(config.Config{})
event1 := &monitor.InterruptionEvent{
EventID: "1",
StartTime: time.Now().Add(time.Second * 20),
Drained: false,
NodeName: node1,
EventID: "1",
StartTime: time.Now().Add(time.Second * 20),
NodeProcessed: false,
NodeName: node1,
}
event2 := &monitor.InterruptionEvent{
EventID: "2",
StartTime: time.Now().Add(time.Second * 20),
Drained: false,
NodeName: node1,
EventID: "2",
StartTime: time.Now().Add(time.Second * 20),
NodeProcessed: false,
NodeName: node1,
}

store.AddInterruptionEvent(event1)
store.AddInterruptionEvent(event2)
store.MarkAllAsDrained(node1)
store.MarkAllAsProcessed(node1)

// When events are marked as Drained=true, then they are no longer
// When events are marked as NodeProcessed=true, then they are no longer
// returned by the GetActiveEvent func, so we expect false
_, isActive := store.GetActiveEvent()
h.Equals(t, false, isActive)
Expand Down
2 changes: 1 addition & 1 deletion pkg/monitor/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ type InterruptionEvent struct {
InstanceID string
StartTime time.Time
EndTime time.Time
Drained bool
NodeProcessed bool
InProgress bool
PreDrainTask DrainTask `json:"-"`
PostDrainTask DrainTask `json:"-"`
Expand Down