From e5731b17ea6dc38616e26ed253581f659936bc15 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Fri, 17 Jan 2020 21:47:07 +0900 Subject: [PATCH 01/12] =crdt slightly improved logging, message sounded wrong when ignoring leadership change event --- Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift b/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift index 6d0984cf5..8f2b69b41 100644 --- a/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift +++ b/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift @@ -104,8 +104,9 @@ extension CRDT.Replicator { self.receiveClusterEvent(context, event: .membershipChange(change)) } - default: + case .membershipChange: context.log.trace("Ignoring cluster event \(event), only interested in >= .up events", metadata: self.metadata(context)) + default: () // ignore other events } } From 1e8d059233221f87b6c44f7d2a57e5374234b551 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Fri, 17 Jan 2020 22:34:28 +0900 Subject: [PATCH 02/12] =cluster #55 add automatic OnDownActions, default to system shutdown --- Sources/DistributedActors/ActorLogging.swift | 4 ++ Sources/DistributedActors/ActorSystem.swift | 2 +- .../Cluster/ClusterSettings.swift | 36 +++++++++++++ .../Cluster/ClusterShell.swift | 5 +- .../Cluster/SWIM/SWIMShell.swift | 1 - .../DistributedActorsTestKit/LogCapture.swift | 11 ++++ .../ClusterOnDownActionTests+XCTest.swift | 30 +++++++++++ .../Cluster/ClusterOnDownActionTests.swift | 54 +++++++++++++++++++ .../DeadLetterTests.swift | 22 +++----- Tests/LinuxMain.swift | 1 + 10 files changed, 147 insertions(+), 19 deletions(-) create mode 100644 Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests+XCTest.swift create mode 100644 Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift diff --git a/Sources/DistributedActors/ActorLogging.swift b/Sources/DistributedActors/ActorLogging.swift index 2289aeb64..e20c33d5f 100644 --- a/Sources/DistributedActors/ActorLogging.swift +++ b/Sources/DistributedActors/ActorLogging.swift @@ -83,6 +83,10 @@ public struct ActorLogger { } public static func make(system: ActorSystem, identifier: String? = nil) -> Logger { + if let overriddenLoggerFactory = system.settings.overrideLoggerFactory { + return overriddenLoggerFactory(identifier ?? system.name) + } + // we need to add our own storage, and can't do so to Logger since it is a struct... // so we need to make such "proxy log handler", that does out actor specific things. var proxyHandler = ActorOriginLogHandler(system) diff --git a/Sources/DistributedActors/ActorSystem.swift b/Sources/DistributedActors/ActorSystem.swift index 61acf2778..084bf29b8 100644 --- a/Sources/DistributedActors/ActorSystem.swift +++ b/Sources/DistributedActors/ActorSystem.swift @@ -98,7 +98,7 @@ public final class ActorSystem { // MARK: Logging public var log: Logger { - var l = ActorLogger.make(system: self) // we only do this to go "through" the proxy; we may not need it in the future? 
+ var l = ActorLogger.make(system: self) l.logLevel = self.settings.defaultLogLevel return l } diff --git a/Sources/DistributedActors/Cluster/ClusterSettings.swift b/Sources/DistributedActors/Cluster/ClusterSettings.swift index 20035b672..0d6b8c292 100644 --- a/Sources/DistributedActors/Cluster/ClusterSettings.swift +++ b/Sources/DistributedActors/Cluster/ClusterSettings.swift @@ -143,8 +143,15 @@ public struct ClusterSettings { // ==== ---------------------------------------------------------------------------------------------------------------- // MARK: Cluster membership and failure detection + /// Strategy how members determine if others (or myself) shall be marked as `.down`. + /// This strategy should be set to the same (or compatible) strategy on all members of a cluster to avoid split brain situations. public var downingStrategy: DowningStrategySettings = .none + /// When this member node notices it has been marked as `.down` in the membership, it can automatically perform an action. + /// This setting determines which action to take. Generally speaking, the best course of action is to quickly and gracefully + /// shut down the node and process, potentially leaving a higher level orchestrator to replace the node (e.g. k8s starting a new pod for the cluster). + public var onDownAction: OnDownActionStrategySettings = .shutdown + /// Configures the SWIM failure failure detector. public var swim: SWIM.Settings = .default @@ -169,6 +176,9 @@ public struct ClusterSettings { } } +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: DowningStrategySettings + public enum DowningStrategySettings { case none case timeout(TimeoutBasedDowningStrategySettings) @@ -182,3 +192,29 @@ public enum DowningStrategySettings { } } } + +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: OnDownActionStrategySettings + +public enum OnDownActionStrategySettings { + /// Take no (automatic) action upon noticing that this member is marked as `.down`. + /// + /// When using this mode you should take special care to implement some form of shutting down of this node (!). + /// As a `Cluster.MemberStatus.down` node is effectively useless for the rest of the cluster -- i.e. other + /// members MUST refuse communication with this down node. + case none + /// Upon noticing that this member is marked as `.down`, initiate a shutdown. + case shutdown + + func make() -> (ActorSystem) -> Void { + switch self { + case .none: + return { _ in () } // do nothing + case .shutdown: + return { system in + system.log.warning("This node was marked as `.down` in membership. 
Performing OnDownAction as configured: shutting down the system.") + system.shutdown() + } + } + } +} diff --git a/Sources/DistributedActors/Cluster/ClusterShell.swift b/Sources/DistributedActors/Cluster/ClusterShell.swift index c896eb73b..ac07a7a07 100644 --- a/Sources/DistributedActors/Cluster/ClusterShell.swift +++ b/Sources/DistributedActors/Cluster/ClusterShell.swift @@ -968,9 +968,12 @@ extension ClusterShell { guard memberToDown.node != state.myselfNode else { // ==== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Down(self node); ensuring SWIM knows about this and should likely initiate graceful shutdown + context.log.warning("Self node was determined [.down]!", metadata: [ + "cluster/membership": "\(state.membership)", // TODO: introduce state.metadata pattern? + ]) self.swimRef.tell(.local(.confirmDead(state.myselfNode))) - context.log.warning("Self node was determined [.down]. (TODO: initiate shutdown based on config)") // TODO: initiate a shutdown it configured to do so + context.system.settings.cluster.onDownAction.make()(context.system) return state } diff --git a/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift b/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift index ee6cb20bd..695d0f31a 100644 --- a/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift +++ b/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift @@ -403,7 +403,6 @@ internal struct SWIMShell { case .associated(let control): continueWithAssociation(.success(control.remoteNode)) case .tombstone: - context.log.info("TOMBSTONE: \(remoteNode)") let msg = "Association target node is already .tombstoned, not associating. Node \(reflecting: remoteNode) likely to be removed from gossip shortly." continueWithAssociation(.failure(EnsureAssociationError(msg))) return // we shall not associate with this tombstoned node (!) diff --git a/Sources/DistributedActorsTestKit/LogCapture.swift b/Sources/DistributedActorsTestKit/LogCapture.swift index 9517d5c3e..a7ec96482 100644 --- a/Sources/DistributedActorsTestKit/LogCapture.swift +++ b/Sources/DistributedActorsTestKit/LogCapture.swift @@ -60,6 +60,17 @@ public final class LogCapture { } } } + + public func awaitLogContaining(_ testKit: ActorTestKit, text: String, within: TimeAmount = .seconds(3), file: StaticString = #file, line: UInt = #line) throws { + return try testKit.eventually(within: within, file: file, line: line) { + let logs = self.logs + if !logs.contains(where: { log in + "\(log)".contains(text) + }) { + throw TestError("Logs did not contain [\(text)].") + } + } + } } extension LogCapture { diff --git a/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests+XCTest.swift new file mode 100644 index 000000000..b0a8a3ad9 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests+XCTest.swift @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2020 Apple Inc. 
and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +import XCTest + +/// +/// NOTE: This file was generated by generate_linux_tests.rb +/// +/// Do NOT edit this file directly as it will be regenerated automatically when needed. +/// + +extension ClusterOnDownActionTests { + static var allTests: [(String, (ClusterOnDownActionTests) -> () throws -> Void)] { + return [ + ("test_onNodeDowned_performShutdown", test_onNodeDowned_performShutdown), + ("test_onNodeDowned_configuredNoop_doNothing", test_onNodeDowned_configuredNoop_doNothing), + ] + } +} diff --git a/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift new file mode 100644 index 000000000..380a288a4 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +@testable import DistributedActors +import DistributedActorsTestKit +import NIOSSL +import XCTest + +final class ClusterOnDownActionTests: ClusteredNodesTestBase { + func test_onNodeDowned_performShutdown() throws { + try shouldNotThrow { + let (first, second) = self.setUpPair() + + try self.joinNodes(node: first, with: second) + + second.cluster.down(node: first.cluster.node.node) + + try self.capturedLogs(of: first).awaitLogContaining(self.testKit(first), text: "Self node was determined [.down]!") + + try self.testKit(first).eventually(within: .seconds(1)) { + guard first.isShuttingDown else { + throw TestError("First system should be shutting down, it was marked down and the default onDownAction should have triggered!") + } + } + } + } + + func test_onNodeDowned_configuredNoop_doNothing() throws { + try shouldNotThrow { + let (first, second) = self.setUpPair { settings in + settings.cluster.onDownAction = .none + } + + try self.joinNodes(node: first, with: second) + + second.cluster.down(node: first.cluster.node.node) + + try self.capturedLogs(of: first).awaitLogContaining(self.testKit(first), text: "Self node was determined [.down]!") + + first.isShuttingDown.shouldBeFalse() + } + } +} diff --git a/Tests/DistributedActorsTests/DeadLetterTests.swift b/Tests/DistributedActorsTests/DeadLetterTests.swift index 2e504cad6..aa3c907b8 100644 --- a/Tests/DistributedActorsTests/DeadLetterTests.swift +++ b/Tests/DistributedActorsTests/DeadLetterTests.swift @@ -29,8 +29,8 @@ final class DeadLetterTests: ActorSystemTestBase { office.deliver("Hello") - try self.awaitLogContaining(text: "[Hello]:Swift.String was not delivered") - try self.awaitLogContaining(text: "/user/someone") + try self.logCapture.awaitLogContaining(self.testKit, text: "[Hello]:Swift.String was not delivered") + try 
self.logCapture.awaitLogContaining(self.testKit, text: "/user/someone") } // ==== ------------------------------------------------------------------------------------------------------------ @@ -48,8 +48,8 @@ final class DeadLetterTests: ActorSystemTestBase { ref.tell("Are you still there?") - try self.awaitLogContaining(text: "Are you still there?") - try self.awaitLogContaining(text: "/user/ludwig") + try self.logCapture.awaitLogContaining(self.testKit, text: "Are you still there?") + try self.logCapture.awaitLogContaining(self.testKit, text: "/user/ludwig") } func test_askingTerminatedActor_shouldResultInDeadLetter() throws { @@ -72,17 +72,7 @@ final class DeadLetterTests: ActorSystemTestBase { try answer.nioFuture.wait() } - try self.awaitLogContaining(text: "This is a question") - try self.awaitLogContaining(text: "/user/ludwig") - } - - private func awaitLogContaining(text: String, file: StaticString = #file, line: UInt = #line) throws { - return try self.testKit.eventually(within: .seconds(3), file: file, line: line) { - if !self.logCapture.logs.contains(where: { log in - "\(log)".contains(text) - }) { - throw TestError("Keep waiting; Contained only: \(self.logCapture.logs)") - } - } + try self.logCapture.awaitLogContaining(self.testKit, text: "This is a question") + try self.logCapture.awaitLogContaining(self.testKit, text: "/user/ludwig") } } diff --git a/Tests/LinuxMain.swift b/Tests/LinuxMain.swift index 92d6d3109..712bf9738 100644 --- a/Tests/LinuxMain.swift +++ b/Tests/LinuxMain.swift @@ -69,6 +69,7 @@ XCTMain([ testCase(ClusterEventsSerializationTests.allTests), testCase(ClusterLeaderActionsTests.allTests), testCase(ClusterMembershipGossipTests.allTests), + testCase(ClusterOnDownActionTests.allTests), testCase(ClusterReceptionistTests.allTests), testCase(ConcurrencyHelpersTests.allTests), testCase(CustomStringInterpolationTests.allTests), From 2d9ea0c5f37d449cfbcdf9a314bbd3eb3139a203 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Mon, 20 Jan 2020 12:37:28 +0900 Subject: [PATCH 03/12] =test fix #377 in face of auto shutdown --- .../Cluster/AssociationClusteredTests.swift | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift index 8d15f324c..cd51db1f1 100644 --- a/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift +++ b/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift @@ -264,11 +264,14 @@ final class ClusterAssociationTests: ClusteredNodesTestBase { func test_down_self_shouldChangeMembershipSelfToBeDown() throws { try shouldNotThrow { - let (first, second) = setUpPair() + let (first, second) = setUpPair { settings in + settings.cluster.onDownAction = .none // as otherwise we can't inspect if we really changed the status to .down, as we might shutdown too quickly :-) + } second.cluster.join(node: first.cluster.node.node) try assertAssociated(first, withExactly: second.cluster.node) + // down myself first.cluster.down(node: first.cluster.node.node) let localProbe = self.testKit(first).spawnTestProbe(expecting: Cluster.Membership.self) From 1ffcb87b420ef682675a8475c629527345655e5b Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Mon, 20 Jan 2020 14:07:29 +0900 Subject: [PATCH 04/12] =test #378 Temp workaround for downing() self + a node shutting itself down immediately, must become leaving instead --- Sources/DistributedActors/Cluster/ClusterControl.swift | 4 ++++ 
Sources/DistributedActors/Cluster/ClusterShell.swift | 6 +++++- .../Cluster/SystemMessages+Redelivery.swift | 4 ++-- .../Cluster/Transport/TransportPipelines.swift | 5 ++--- .../ActorSingletonPluginTests.swift | 3 ++- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Sources/DistributedActors/Cluster/ClusterControl.swift b/Sources/DistributedActors/Cluster/ClusterControl.swift index 5bbc31616..3719dff04 100644 --- a/Sources/DistributedActors/Cluster/ClusterControl.swift +++ b/Sources/DistributedActors/Cluster/ClusterControl.swift @@ -47,6 +47,10 @@ public struct ClusterControl { self.ref.tell(.command(.initJoin(node))) } +// func leave() { +// // issue a .leaving and then ensure everyone has seen it, then become down +// } + /// Mark as `Cluster.MemberStatus.down` _any_ incarnation of a member matching the passed in `node`. public func down(node: Node) { self.ref.tell(.command(.downCommand(node))) diff --git a/Sources/DistributedActors/Cluster/ClusterShell.swift b/Sources/DistributedActors/Cluster/ClusterShell.swift index ac07a7a07..57ecedb24 100644 --- a/Sources/DistributedActors/Cluster/ClusterShell.swift +++ b/Sources/DistributedActors/Cluster/ClusterShell.swift @@ -946,6 +946,10 @@ extension ClusterShell { return .same } + if membersToDown.contains(where: { m in m.node == state.myselfNode }) { + state.log.warning("Downing self node [\(state.myselfNode)]. Prefer issuing `cluster.leave()` when leaving gracefully.") + } + var state: ClusterShellState = state for memberToDown in membersToDown { state = self.onDownCommand0(context, state: state, member: memberToDown) @@ -972,7 +976,7 @@ extension ClusterShell { "cluster/membership": "\(state.membership)", // TODO: introduce state.metadata pattern? ]) - self.swimRef.tell(.local(.confirmDead(state.myselfNode))) + self.swimRef.tell(.local(.confirmDead(memberToDown.node))) context.system.settings.cluster.onDownAction.make()(context.system) return state diff --git a/Sources/DistributedActors/Cluster/SystemMessages+Redelivery.swift b/Sources/DistributedActors/Cluster/SystemMessages+Redelivery.swift index 0fbad9c68..8e876101c 100644 --- a/Sources/DistributedActors/Cluster/SystemMessages+Redelivery.swift +++ b/Sources/DistributedActors/Cluster/SystemMessages+Redelivery.swift @@ -346,7 +346,7 @@ extension InboundSystemMessages.InboundSystemMessageArrivalDirective { // MARK: Settings public struct OutboundSystemMessageRedeliverySettings { - public static let `default` = OutboundSystemMessageRedeliverySettings() + public static let `default`: OutboundSystemMessageRedeliverySettings = .init() /// When enabled, logs all outbound messages using the tracelog facility. 
/// Logs lines will be marked with: [tracelog:sys-msg-redelivery] @@ -359,7 +359,7 @@ public struct OutboundSystemMessageRedeliverySettings { var redeliveryInterval: TimeAmount = .seconds(1) internal var makeRedeliveryBackoff: ConstantBackoffStrategy { - return Backoff.constant(self.redeliveryInterval) + Backoff.constant(self.redeliveryInterval) } /// Configures the maximum number (per association) diff --git a/Sources/DistributedActors/Cluster/Transport/TransportPipelines.swift b/Sources/DistributedActors/Cluster/Transport/TransportPipelines.swift index 3ff666ce0..13c6ba8d9 100644 --- a/Sources/DistributedActors/Cluster/Transport/TransportPipelines.swift +++ b/Sources/DistributedActors/Cluster/Transport/TransportPipelines.swift @@ -368,8 +368,8 @@ internal final class SystemMessageRedeliveryHandler: ChannelDuplexHandler { self.tracelog(.outbound, message: redeliveryEnvelope) context.writeAndFlush(self.wrapOutboundOut(redeliveryEnvelope), promise: promise) case .bufferOverflowMustAbortAssociation: + self.log.error("Outbound system message queue overflow! MUST abort association, system state integrity cannot be ensured (e.g. terminated signals may have been lost).") if let node = transportEnvelope.recipient.node { - // TODO: use ClusterControl once implemented self.clusterShell.tell(.command(.downCommand(node.node))) } } @@ -667,10 +667,9 @@ extension ClusterShell { let log = ActorLogger.make(system: system, identifier: "server") - // FIXME: PASS IN FROM ASSOCIATION SINCE MUST SURVIVE CONNECTIONS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // FIXME: PASS IN FROM ASSOCIATION SINCE MUST SURVIVE CONNECTIONS! // TODO: tests about killing connections the hard way let outboundSysMsgs = OutboundSystemMessageRedelivery(settings: .default) let inboundSysMsgs = InboundSystemMessages(settings: .default) - // FIXME: PASS IN FROM ASSOCIATION SINCE MUST SURVIVE CONNECTIONS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // TODO: Ensure we don't read faster than we can write by adding the BackPressureHandler into the pipeline. let otherHandlers: [(String?, ChannelHandler)] = [ diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift index 186d098fc..63edc4f26 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift @@ -214,7 +214,8 @@ final class ActorSingletonPluginTests: ClusteredNodesTestBase { try replyProbe3.expectMessage("Hello-1 Charlie!") // Take down the leader - first.cluster.down(node: first.cluster.node.node) + // first.cluster.down(node: first.cluster.node.node) // FIXME: must also work when the node downs itself and shuts down (!!!) (we do not move to down currently, no default downing impl) + second.cluster.down(node: first.cluster.node.node) // Ensure the node is seen down try self.ensureNodes(.down, on: second, systems: first) From fcc4aef456e82060e57395cee0417581b8fbf265 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 00:40:19 +0900 Subject: [PATCH 05/12] =cluster,down Hardening across the board, to survive a node hard quitting - more tests for all cases of "self downing", by shutting down, leaving and downing myself. 
- downing is now reimplemented in terms of Cluster.Events, which makes it more resilient -- it had a bug where a not-self leader would be marked it would throw, since the membership was only partially maintained - the singleton tests have been now passing consistently - cluster OnDownAction shutdown now is gracefulshutdown and has a timeout. Current impl is harsh and just delays the shutting down. --- .../ActorShell+Children.swift | 2 +- Sources/DistributedActors/ActorSystem.swift | 2 +- .../Cluster/Cluster+Member.swift | 5 + .../Cluster/Cluster+Membership.swift | 8 +- .../Cluster/ClusterControl.swift | 10 +- .../Cluster/ClusterReceptionist.swift | 65 ++++- .../Cluster/ClusterSettings.swift | 47 +--- .../Cluster/ClusterShell.swift | 120 +++++---- .../Cluster/ClusterShellState.swift | 14 +- .../DowningStrategy/DowningStrategy.swift | 137 ++++------ .../TimeoutBasedDowningStrategy.swift | 98 ++++--- .../Cluster/Leadership.swift | 14 +- .../Cluster/SWIM/SWIMSettings.swift | 12 +- .../Cluster/SWIM/SWIMShell.swift | 18 +- .../Pattern/ConvergentGossip.swift | 50 ++-- Sources/DistributedActors/Refs.swift | 2 +- .../ActorSystemTestBase.swift | 18 +- .../Cluster/ClusteredNodesTestBase.swift | 5 + .../DistributedActorsTestKit/LogCapture.swift | 11 +- .../DistributedActorsTestKit/TestProbes.swift | 2 +- ...SingletonPluginClusteredTests+XCTest.swift | 31 +++ .../ActorSingletonPluginClusteredTests.swift | 239 ++++++++++++++++++ .../ActorSingletonPluginTests+XCTest.swift | 5 +- .../ActorSingletonPluginTests.swift | 5 +- .../DowningClusteredTests+XCTest.swift | 32 +++ .../DowningClusteredTests.swift | 92 +++++++ ...eoutBasedDowningInstanceTests+XCTest.swift | 2 +- .../TimeoutBasedDowningInstanceTests.swift | 68 +++-- .../Cluster/LeadershipTests+XCTest.swift | 18 +- .../Cluster/LeadershipTests.swift | 63 +++-- ...> SWIMInstanceClusteredTests+XCTest.swift} | 4 +- ...swift => SWIMInstanceClusteredTests.swift} | 2 +- .../Cluster/SWIM/SWIMShellTests.swift | 2 +- Tests/LinuxMain.swift | 4 +- 34 files changed, 867 insertions(+), 340 deletions(-) rename {Tests/DistributedActorsTests => Sources/DistributedActorsTestKit}/ActorSystemTestBase.swift (75%) create mode 100644 Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests+XCTest.swift create mode 100644 Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift create mode 100644 Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift create mode 100644 Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift rename Tests/DistributedActorsTests/Cluster/SWIM/{SWIMInstance+ClusterTests+XCTest.swift => SWIMInstanceClusteredTests+XCTest.swift} (89%) rename Tests/DistributedActorsTests/Cluster/SWIM/{SWIMInstance+ClusterTests.swift => SWIMInstanceClusteredTests.swift} (98%) diff --git a/Sources/DistributedActors/ActorShell+Children.swift b/Sources/DistributedActors/ActorShell+Children.swift index 959b7589b..a4831de0b 100644 --- a/Sources/DistributedActors/ActorShell+Children.swift +++ b/Sources/DistributedActors/ActorShell+Children.swift @@ -398,5 +398,5 @@ public enum ActorContextError: Error { /// It is not allowed to spawn case duplicateActorPath(path: ActorPath) /// It is not allowed to spawn new actors when the system is stopping - case alreadyStopping + case alreadyStopping(String) } diff --git a/Sources/DistributedActors/ActorSystem.swift b/Sources/DistributedActors/ActorSystem.swift index 084bf29b8..91edc1756 100644 --- a/Sources/DistributedActors/ActorSystem.swift +++ 
b/Sources/DistributedActors/ActorSystem.swift @@ -337,7 +337,7 @@ public final class ActorSystem { self.settings.plugins.stopAll(self) DispatchQueue.global().async { - self.log.log(level: .debug, "SHUTTING DOWN ACTOR SYSTEM [\(self.name)]. All actors will be stopped.", file: #file, function: #function, line: #line) + self.log.log(level: .debug, "Shutting down actor system [\(self.name)]. All actors will be stopped.", file: #file, function: #function, line: #line) if let cluster = self._cluster { let receptacle = BlockingReceptacle() cluster.ref.tell(.command(.shutdown(receptacle))) // FIXME: should be shutdown diff --git a/Sources/DistributedActors/Cluster/Cluster+Member.swift b/Sources/DistributedActors/Cluster/Cluster+Member.swift index f501eaebf..85f8b1eba 100644 --- a/Sources/DistributedActors/Cluster/Cluster+Member.swift +++ b/Sources/DistributedActors/Cluster/Cluster+Member.swift @@ -28,6 +28,11 @@ extension Cluster { public var status: Cluster.MemberStatus /// Reachability signifies the failure detectors assessment about this members "reachability" i.e. if it is responding to health checks or not. + /// + /// ### Reachability of .down or .removed nodes + /// Worth pointing out that a `.down` member may still have a `.reachable` reachability field, + /// this usually means that the decision to move the member `.down` was not made by the failure detection layer, + /// but rather issued programmatically, or by some other non-reachability provoked reason. public var reachability: Cluster.MemberReachability /// Sequence number at which this node was moved to `.up` by a leader. diff --git a/Sources/DistributedActors/Cluster/Cluster+Membership.swift b/Sources/DistributedActors/Cluster/Cluster+Membership.swift index 5af666a4e..511d7e907 100644 --- a/Sources/DistributedActors/Cluster/Cluster+Membership.swift +++ b/Sources/DistributedActors/Cluster/Cluster+Membership.swift @@ -236,7 +236,8 @@ extension Cluster.Membership { if self.firstMember(change.node.node) == nil { // TODO: more general? // TODO this entire method should be simpler _ = self.join(change.node) } - return self.mark(change.node, as: status) + let change = self.mark(change.node, as: status) + return change } } @@ -278,6 +279,11 @@ extension Cluster.Membership { } } + /// Alias for `applyLeadershipChange(to:)` + public mutating func applyLeadershipChange(_ change: Cluster.LeadershipChange?) throws -> Cluster.LeadershipChange? { + try self.applyLeadershipChange(to: change?.newLeader) + } + /// - Returns: the changed member if the change was a transition (unreachable -> reachable, or back), /// or `nil` if the reachability is the same as already known by the membership. public mutating func applyReachabilityChange(_ change: Cluster.ReachabilityChange) -> Cluster.Member? { diff --git a/Sources/DistributedActors/Cluster/ClusterControl.swift b/Sources/DistributedActors/Cluster/ClusterControl.swift index 3719dff04..7a073e538 100644 --- a/Sources/DistributedActors/Cluster/ClusterControl.swift +++ b/Sources/DistributedActors/Cluster/ClusterControl.swift @@ -47,12 +47,16 @@ public struct ClusterControl { self.ref.tell(.command(.initJoin(node))) } -// func leave() { -// // issue a .leaving and then ensure everyone has seen it, then become down -// } + public func leave() { + self.ref.tell(.command(.downCommand(self.node.node))) + } /// Mark as `Cluster.MemberStatus.down` _any_ incarnation of a member matching the passed in `node`. 
public func down(node: Node) { self.ref.tell(.command(.downCommand(node))) } + + public func down(member: Cluster.Member) { + self.ref.tell(.command(.downCommandMember(member))) + } } diff --git a/Sources/DistributedActors/Cluster/ClusterReceptionist.swift b/Sources/DistributedActors/Cluster/ClusterReceptionist.swift index 2514b8cfc..1b57f09af 100644 --- a/Sources/DistributedActors/Cluster/ClusterReceptionist.swift +++ b/Sources/DistributedActors/Cluster/ClusterReceptionist.swift @@ -226,7 +226,10 @@ internal enum ClusterReceptionist { } private static func syncRegistrations(context: ActorContext, myself: ActorRef) throws { - let remoteControls = context.system._cluster!.associationRemoteControls // FIXME: should not be needed and use cluster members instead + guard let cluster = context.system._cluster else { // FIXME: should not be needed and use cluster members instead + return // cannot get _cluster, perhaps we are shutting down already? + } + let remoteControls = cluster.associationRemoteControls guard !remoteControls.isEmpty else { return // nothing to do, no remote members @@ -249,6 +252,64 @@ internal enum ClusterReceptionist { } private static func makeRemoteAddress(on node: UniqueNode) -> ActorAddress { - return try! .init(node: node, path: ActorPath([ActorPathSegment("system"), ActorPathSegment("receptionist")]), incarnation: .wellKnown) + try! .init(node: node, path: ActorPath([ActorPathSegment("system"), ActorPathSegment("receptionist")]), incarnation: .wellKnown) // try! safe, we know the path is legal + } +} + +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: DowningStrategySettings + +public enum DowningStrategySettings { + case none + case timeout(TimeoutBasedDowningStrategySettings) + + func make(_ clusterSettings: ClusterSettings) -> DowningStrategy? { + switch self { + case .none: + return nil + case .timeout(let settings): + return TimeoutBasedDowningStrategy(settings, selfNode: clusterSettings.uniqueBindNode) + } + } +} + +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: OnDownActionStrategySettings + +public enum OnDownActionStrategySettings { + /// Take no (automatic) action upon noticing that this member is marked as [.down]. + /// + /// When using this mode you should take special care to implement some form of shutting down of this node (!). + /// As a `Cluster.MemberStatus.down` node is effectively useless for the rest of the cluster -- i.e. other + /// members MUST refuse communication with this down node. + case none + /// Upon noticing that this member is marked as [.down], initiate a shutdown. + case gracefulShutdown(delay: TimeAmount) + + func make() -> (ActorSystem) throws -> Void { + switch self { + case .none: + return { _ in () } // do nothing + + case .gracefulShutdown(let shutdownDelay): + return { system in + _ = try system.spawn("leaver", of: String.self, .setup { context in + guard .milliseconds(0) < shutdownDelay else { + context.log.warning("This node was marked as [.down], delay is immediate. 
Shutting down the system immediately!") + system.shutdown() + return .stop + } + + context.timers.startSingle(key: "shutdown-delay", message: "shutdown", delay: shutdownDelay) + system.log.warning("This node was marked as [.down], performing OnDownAction as configured: shutting down the system, in \(shutdownDelay)") + + return .receiveMessage { _ in + system.log.warning("Shutting down...") + system.shutdown() + return .stop + } + }) + } + } } } diff --git a/Sources/DistributedActors/Cluster/ClusterSettings.swift b/Sources/DistributedActors/Cluster/ClusterSettings.swift index 0d6b8c292..184d12120 100644 --- a/Sources/DistributedActors/Cluster/ClusterSettings.swift +++ b/Sources/DistributedActors/Cluster/ClusterSettings.swift @@ -145,12 +145,12 @@ public struct ClusterSettings { /// Strategy how members determine if others (or myself) shall be marked as `.down`. /// This strategy should be set to the same (or compatible) strategy on all members of a cluster to avoid split brain situations. - public var downingStrategy: DowningStrategySettings = .none + public var downingStrategy: DowningStrategySettings = .timeout(.default) /// When this member node notices it has been marked as `.down` in the membership, it can automatically perform an action. /// This setting determines which action to take. Generally speaking, the best course of action is to quickly and gracefully /// shut down the node and process, potentially leaving a higher level orchestrator to replace the node (e.g. k8s starting a new pod for the cluster). - public var onDownAction: OnDownActionStrategySettings = .shutdown + public var onDownAction: OnDownActionStrategySettings = .gracefulShutdown(delay: .seconds(3)) /// Configures the SWIM failure failure detector. public var swim: SWIM.Settings = .default @@ -175,46 +175,3 @@ public struct ClusterSettings { self.tls = tls } } - -// ==== ---------------------------------------------------------------------------------------------------------------- -// MARK: DowningStrategySettings - -public enum DowningStrategySettings { - case none - case timeout(TimeoutBasedDowningStrategySettings) - - func make(_ clusterSettings: ClusterSettings) -> DowningStrategy? { - switch self { - case .none: - return nil - case .timeout(let settings): - return TimeoutBasedDowningStrategy(settings, selfNode: clusterSettings.uniqueBindNode) - } - } -} - -// ==== ---------------------------------------------------------------------------------------------------------------- -// MARK: OnDownActionStrategySettings - -public enum OnDownActionStrategySettings { - /// Take no (automatic) action upon noticing that this member is marked as `.down`. - /// - /// When using this mode you should take special care to implement some form of shutting down of this node (!). - /// As a `Cluster.MemberStatus.down` node is effectively useless for the rest of the cluster -- i.e. other - /// members MUST refuse communication with this down node. - case none - /// Upon noticing that this member is marked as `.down`, initiate a shutdown. - case shutdown - - func make() -> (ActorSystem) -> Void { - switch self { - case .none: - return { _ in () } // do nothing - case .shutdown: - return { system in - system.log.warning("This node was marked as `.down` in membership. 
Performing OnDownAction as configured: shutting down the system.") - system.shutdown() - } - } - } -} diff --git a/Sources/DistributedActors/Cluster/ClusterShell.swift b/Sources/DistributedActors/Cluster/ClusterShell.swift index 57ecedb24..7979116c1 100644 --- a/Sources/DistributedActors/Cluster/ClusterShell.swift +++ b/Sources/DistributedActors/Cluster/ClusterShell.swift @@ -274,10 +274,13 @@ internal class ClusterShell { case handshakeWith(Node, replyTo: ActorRef?) case retryHandshake(HandshakeStateMachine.InitiatedState) - case reachabilityChanged(UniqueNode, Cluster.MemberReachability) + case failureDetectorReachabilityChanged(UniqueNode, Cluster.MemberReachability) /// Used to signal a "down was issued" either by the user, or another part of the system. case downCommand(Node) + /// Used to signal a "down was issued" either by the user, or another part of the system. + case downCommandMember(Cluster.Member) + case shutdown(BlockingReceptacle) // TODO: could be NIO future } @@ -404,8 +407,7 @@ extension ClusterShell { case .retryHandshake(let initiated): return self.connectSendHandshakeOffer(context, state, initiated: initiated) - // FIXME: this is now a cluster event !!!!! - case .reachabilityChanged(let node, let reachability): + case .failureDetectorReachabilityChanged(let node, let reachability): guard let member = state.membership.uniqueMember(node) else { return .same // reachability change of unknown node } @@ -420,7 +422,11 @@ extension ClusterShell { return self.onShutdownCommand(context, state: state, signalOnceUnbound: receptacle) case .downCommand(let node): - return self.onDownCommand(context, state: state, node: node) + return self.ready(state: state.membership.members(node).reduce(state) { _, member in + self.onDownCommand(context, state: state, member: member) + }) + case .downCommandMember(let member): + return self.ready(state: self.onDownCommand(context, state: state, member: member)) } } @@ -469,6 +475,10 @@ extension ClusterShell { let changeDirective = state.applyClusterEventAsChange(event) self.interpretLeaderActions(&state, changeDirective.leaderActions) + if case .membershipChange(let change) = event { + self.tryIntroduceGossipPeer(context, state, change: change) + } + if changeDirective.applied { state.latestGossip.incrementOwnerVersion() // we only publish the event if it really caused a change in membership, to avoid echoing "the same" change many times. @@ -496,7 +506,12 @@ extension ClusterShell { "gossip/before": "\(beforeGossipMerge)", "gossip/now": "\(state.latestGossip)", ]) + mergeDirective.effectiveChanges.forEach { effectiveChange in +// /// we are careful to only introduce here, and only "new" members +// // TODO: consider reusing receptionist for this (!) +// self.tryIntroduceGossipPeer(context, state, change: effectiveChange) +// let event: Cluster.Event = .membershipChange(effectiveChange) self.clusterEvents.publish(event) } @@ -504,26 +519,8 @@ extension ClusterShell { return self.ready(state: state) } - func updateGossipPeers(context: ActorContext) { - // ==== Update membership information to gossip to our latest "view" --------------------------------------- - // TODO: make it cleaner? though we decided to go with manual peer management as the ClusterShell owns it, hm - let members = state.membership.members(atMost: .removed) - for member in members { - if state.myselfNode != member.node { - // TODO: consider receptionist instead of this; we're "early" but receptionist could already be spreading its info to this node, since we associated. 
- let gossipPeer: ConvergentGossip.Ref = context.system._resolve( - context: .init(address: ._clusterGossip(on: member.node), system: context.system) - ) - state.gossipControl.introduce(peer: gossipPeer) - } - } - // TODO: was this needed here? state.gossipControl.update(Cluster.Gossip()) - } - return .setup { context in - updateGossipPeers(context: context) - - return .receive { context, message in + .receive { context, message in switch message { case .command(let command): return receiveShellCommand(context, command: command) case .query(let query): return receiveQuery(context, query: query) @@ -534,6 +531,24 @@ extension ClusterShell { } } } + + func tryIntroduceGossipPeer(_ context: ActorContext, _ state: ClusterShellState, change: Cluster.MembershipChange, file: String = #file, line: UInt = #line) { + guard change.toStatus < .down else { + return + } + guard change.member.node != state.myselfNode else { + return + } + // TODO: make it cleaner? though we decided to go with manual peer management as the ClusterShell owns it, hm + + // TODO: consider receptionist instead of this; we're "early" but receptionist could already be spreading its info to this node, since we associated. + let gossipPeer: ConvergentGossip.Ref = context.system._resolve( + context: .init(address: ._clusterGossip(on: change.member.node), system: context.system) + ) + // FIXME: make sure that if the peer terminated, we don't add it again in here, receptionist would be better then to power this... + // today it can happen that a node goes down but we dont know yet so we add it again :O + state.gossipControl.introduce(peer: gossipPeer) + } } // ==== ---------------------------------------------------------------------------------------------------------------- @@ -663,6 +678,9 @@ extension ClusterShell { state = self.finishTerminateAssociation(context.system, state: state, removalDirective: removalDirective) } + // TODO: try to pull off with receptionist the same dance + self.tryIntroduceGossipPeer(context, state, change: directive.membershipChange) + /// a new node joined, thus if we are the leader, we should perform leader tasks to potentially move it to .up let actions = state.tryCollectLeaderActions() self.interpretLeaderActions(&state, actions) // TODO: not so DRY between acceptAndAssociate and onHandshakeAccepted, DRY it up @@ -778,6 +796,8 @@ extension ClusterShell { self.cacheAssociationRemoteControl(directive.association) state.log.debug("Associated with: \(reflecting: completed.remoteNode); Membership change: \(directive.membershipChange), resulting in: \(state.membership)") + self.tryIntroduceGossipPeer(context, state, change: directive.membershipChange) + // by emitting these `change`s, we not only let anyone interested know about this, // but we also enable the shell (or leadership) to update the leader if it needs changing. if directive.membershipChange.replaced != nil, @@ -849,7 +869,13 @@ extension ClusterShell { state.log.warning("Received .restInPeace from \(fromNode), meaning this node is known to be .down or worse, and should terminate. Initiating self .down-ing.", metadata: [ "sender/node": "\(String(reflecting: fromNode))", ]) - return self.onDownCommand(context, state: state, node: myselfNode.node) + + guard let myselfMember = state.membership.uniqueMember(myselfNode) else { + state.log.error("Unable to find Cluster.Member for \(myselfNode) self node! 
This should not happen, please file an issue.") + return .same + } + + return self.ready(state: self.onDownCommand(context, state: state, member: myselfMember)) } private func notifyHandshakeFailure(state: HandshakeStateMachine.State, node: Node, error: Error) { @@ -867,24 +893,23 @@ extension ClusterShell { } // ==== ---------------------------------------------------------------------------------------------------------------- -// MARK: Unbind +// MARK: Shutdown extension ClusterShell { - // TODO: become "shutdown" rather than just unbind fileprivate func onShutdownCommand(_ context: ActorContext, state: ClusterShellState, signalOnceUnbound: BlockingReceptacle) -> Behavior { let addrDesc = "\(state.settings.uniqueBindNode.node.host):\(state.settings.uniqueBindNode.node.port)" return context.awaitResult(of: state.channel.close(), timeout: context.system.settings.cluster.unbindTimeout) { - // TODO: also close all associations (!!!) + // FIXME: also close all associations (!!!) switch $0 { case .success: context.log.info("Unbound server socket [\(addrDesc)], node: \(reflecting: state.myselfNode)") - signalOnceUnbound.offerOnce(()) self.serializationPool.shutdown() + signalOnceUnbound.offerOnce(()) return .stop case .failure(let err): context.log.warning("Failed while unbinding server socket [\(addrDesc)], node: \(reflecting: state.myselfNode). Error: \(err)") - signalOnceUnbound.offerOnce(()) self.serializationPool.shutdown() + signalOnceUnbound.offerOnce(()) throw err } } @@ -927,7 +952,7 @@ extension ClusterShell { var state = state // TODO: make sure we don't end up infinitely spamming reachability events - if state.applyMemberReachabilityChange(change) != nil { + if state.membership.applyReachabilityChange(change) != nil { self.clusterEvents.publish(.reachabilityChange(change)) self.recordMetrics(context.system.metrics, membership: state.membership) return self.ready(state: state) // TODO: return membershipChanged() where we can do the publish + record in one spot @@ -938,30 +963,10 @@ extension ClusterShell { /// Convenience function for directly handling down command in shell. /// Attempts to locate which member to down and delegates further. - func onDownCommand(_ context: ActorContext, state: ClusterShellState, node: Node) -> Behavior { - let membersToDown = state.membership.members(node).filter { $0.status < .down } - - guard !membersToDown.isEmpty else { - state.log.info("No members to .down; Known members of non-unique node [\(node)]: \(state.membership.members(node))") - return .same - } - - if membersToDown.contains(where: { m in m.node == state.myselfNode }) { - state.log.warning("Downing self node [\(state.myselfNode)]. 
Prefer issuing `cluster.leave()` when leaving gracefully.") - } - - var state: ClusterShellState = state - for memberToDown in membersToDown { - state = self.onDownCommand0(context, state: state, member: memberToDown) - } - - return self.ready(state: state) - } - - func onDownCommand0(_ context: ActorContext, state: ClusterShellState, member memberToDown: Cluster.Member) -> ClusterShellState { + func onDownCommand(_ context: ActorContext, state: ClusterShellState, member memberToDown: Cluster.Member) -> ClusterShellState { var state = state - if let change = state.applyMembershipChange(memberToDown.node, toStatus: .down) { + if let change = state.membership.apply(.init(member: memberToDown, toStatus: .down)) { self.clusterEvents.publish(.membershipChange(change)) if let logChangeLevel = state.settings.logMembershipChanges { @@ -972,12 +977,19 @@ extension ClusterShell { guard memberToDown.node != state.myselfNode else { // ==== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Down(self node); ensuring SWIM knows about this and should likely initiate graceful shutdown - context.log.warning("Self node was determined [.down]!", metadata: [ + context.log.warning("Self node was marked [.down]!", metadata: [ // TODO: carry reason why -- was it gossip, manual or other "cluster/membership": "\(state.membership)", // TODO: introduce state.metadata pattern? ]) self.swimRef.tell(.local(.confirmDead(memberToDown.node))) - context.system.settings.cluster.onDownAction.make()(context.system) + + do { + let onDownAction = context.system.settings.cluster.onDownAction.make() + try onDownAction(context.system) // TODO: return a future and run with a timeout + } catch { + context.system.log.error("Failed to executed onDownAction! Shutting down system forcefully! Error: \(error)") + context.system.shutdown() + } return state } diff --git a/Sources/DistributedActors/Cluster/ClusterShellState.swift b/Sources/DistributedActors/Cluster/ClusterShellState.swift index a99f4352c..21a1d3555 100644 --- a/Sources/DistributedActors/Cluster/ClusterShellState.swift +++ b/Sources/DistributedActors/Cluster/ClusterShellState.swift @@ -435,7 +435,7 @@ extension ClusterShellState { changeWasApplied = false } case .reachabilityChange(let change): - if self.applyMemberReachabilityChange(change) != nil { + if self.membership.applyReachabilityChange(change) != nil { self.log.trace("Applied reachability change: \(change)", metadata: self.metadata) changeWasApplied = true } else { @@ -467,18 +467,6 @@ extension ClusterShellState { let leaderActions: [LeaderAction] } - /// - Returns: the `Cluster.MembershipChange` that was the result of moving the member identified by the `node` to the `toStatus`, - /// or `nil` if no (observable) change resulted from this move (e.g. marking a `.dead` node as `.dead` again, is not a "change"). - mutating func applyMembershipChange(_ node: UniqueNode, toStatus: Cluster.MemberStatus) -> Cluster.MembershipChange? { - return self.membership.apply(Cluster.MembershipChange(member: Cluster.Member(node: node, status: toStatus))) - } - - /// - Returns: the changed member if the change was a transition (unreachable -> reachable, or back), - /// or `nil` if the reachability is the same as already known by the membership. - mutating func applyMemberReachabilityChange(_ change: Cluster.ReachabilityChange) -> Cluster.Member? 
{ - return self.membership.applyReachabilityChange(change) - } - /// If, and only if, the current node is a leader it performs a set of tasks, such as moving nodes to `.up` etc. // TODO: test the actions when leader, not leader, that only applies to joining ones etc mutating func tryCollectLeaderActions() -> [LeaderAction] { diff --git a/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift b/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift index d72aceb7a..ca8e96beb 100644 --- a/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift +++ b/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift @@ -12,40 +12,24 @@ // //===----------------------------------------------------------------------===// -internal protocol DowningStrategy { - func onLeaderChange(to: Cluster.Member?) throws -> DowningStrategyDirectives.LeaderChangeDirective - func onTimeout(_ member: Cluster.Member) -> DowningStrategyDirectives.TimeoutDirective +import Logging - func onMemberUnreachable(_ member: Cluster.Member) -> DowningStrategyDirectives.MemberUnreachableDirective - func onMemberReachable(_ member: Cluster.Member) -> DowningStrategyDirectives.MemberReachableDirective +/// Allows implementing downing strategies, without having to re-implement and reinvent logging and subscription logic. +/// Downing strategies can focus on inspecting the membership and issuing timers if needed. +internal protocol DowningStrategy { + func onClusterEvent(event: Cluster.Event) throws -> DowningStrategyDirective - func onMemberRemoved(_ member: Cluster.Member) -> DowningStrategyDirectives.MemberRemovedDirective + func onTimeout(_ member: Cluster.Member) -> DowningStrategyDirective } -internal enum DowningStrategyDirectives { - enum LeaderChangeDirective { - case none - case markAsDown(Set) - } - - enum TimeoutDirective { - case none - case markAsDown(UniqueNode) - } - - enum MemberReachableDirective { - case none - case cancelTimer - } +internal enum DowningStrategyDirective { + case none + case markAsDown(Set) + case startTimer(key: TimerKey, message: DowningStrategyMessage, delay: TimeAmount) + case cancelTimer(key: TimerKey) - enum MemberRemovedDirective { - case none - case cancelTimer - } - - enum MemberUnreachableDirective { - case none - case startTimer(key: TimerKey, message: DowningStrategyMessage, delay: TimeAmount) + static func markAsDown(_ member: Cluster.Member) -> Self { + Self.markAsDown([member]) } } @@ -64,7 +48,7 @@ internal struct DowningStrategyShell { } var behavior: Behavior { - return .setup { context in + .setup { context in let clusterEventSubRef = context.subReceive(Cluster.Event.self) { event in do { try self.receiveClusterEvent(context, event: event) @@ -77,13 +61,9 @@ internal struct DowningStrategyShell { return .receiveMessage { message in switch message { case .timeout(let member): - context.log.debug("Received timeout for [\(member)]") - switch self.strategy.onTimeout(member) { - case .markAsDown(let node): - self.markAsDown(context, member: node) - case .none: - () // nothing to be done - } + let directive = self.strategy.onTimeout(member) + context.log.debug("Received timeout for [\(member)], resulting in: \(directive)") + self.interpret(context, directive) } return .same @@ -91,61 +71,42 @@ internal struct DowningStrategyShell { } } - func markAsDown(_ context: ActorContext, members: Set) { - for member in members { - self.markAsDown(context, member: member) - } + func receiveClusterEvent(_ context: ActorContext, event: 
Cluster.Event) throws { + let directive: DowningStrategyDirective = try self.strategy.onClusterEvent(event: event) + context.log.trace("Downing strategy reaction to \(event) resulted in: \(directive)", metadata: self.metadata) + self.interpret(context, directive) } - func markAsDown(_ context: ActorContext, member node: UniqueNode) { - context.log.info("Strategy [\(type(of: self.strategy))] decision about unreachable member [\(node)]: marking as: .down") - context.system.cluster.down(node: node.node) - } + func interpret(_ context: ActorContext, _ directive: DowningStrategyDirective) { + switch directive { + case .markAsDown(let members): + self.markAsDown(context, members: members) - func receiveClusterEvent(_ context: ActorContext, event: Cluster.Event) throws { - switch event { - case .snapshot: - () // ignore, we don't need the full membership for decisions // TODO: or do we... - case .leadershipChange(let change): - let directive = try self.strategy.onLeaderChange(to: change.newLeader) - switch directive { - case .markAsDown(let downMembers): - self.markAsDown(context, members: downMembers) - case .none: - () // no members to mark down - } + case .startTimer(let key, let message, let delay): + context.log.trace("Start timer \(key), message: \(message), delay: \(delay)") + context.timers.startSingle(key: key, message: message, delay: delay) + case .cancelTimer(let key): + context.log.trace("Cancel timer \(key)") + context.timers.cancel(for: key) - case .membershipChange(let change) where change.isRemoval: - context.log.debug("Member [\(change.member)] has been removed") - let directive = self.strategy.onMemberRemoved(change.member) - switch directive { - case .cancelTimer: - context.timers.cancel(for: TimerKey(change.member)) - case .none: - () // this member was not marked unreachable, so ignore - } - case .membershipChange: // TODO: actually store and act based on membership - () // - - case .reachabilityChange(let change): - context.log.debug("Member [\(change)] has become \(change.member.reachability)") - - if change.toReachable { - let directive = self.strategy.onMemberReachable(change.member) - switch directive { - case .cancelTimer: - context.timers.cancel(for: TimerKey(change.member.node)) - case .none: - () // this member was not marked unreachable, so ignore - } - } else { - switch self.strategy.onMemberUnreachable(change.member) { - case .startTimer(let key, let message, let delay): - context.timers.startSingle(key: key, message: message, delay: delay) - case .none: - () // nothing to be done - } - } + case .none: + () // nothing to be done } } + + func markAsDown(_ context: ActorContext, members: Set) { + for member in members { + context.log.info("Decision to [.down] member [\(member)]!", metadata: self.metadata.merging([ + "downing/member": "\(member)", + ], uniquingKeysWith: { l, _ in l })) + context.system.cluster.down(member: member) + } + } + + var metadata: Logger.Metadata { + [ + "tag": "downing", + "downing/strategy": "\(type(of: self.strategy))", + ] + } } diff --git a/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift b/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift index b741755c7..c3f6325ba 100644 --- a/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift +++ b/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift @@ -24,33 +24,83 @@ internal final class TimeoutBasedDowningStrategy { var membership: Cluster.Membership var isLeader: 
Bool { - return self.membership.leader?.node == self.selfNode + self.membership.isLeader(self.selfNode) } // unreachable members will be marked down after the timeout expires - var _unreachable: Set + var _unreachable: Set // buffer for nodes that will be marked down, if this node becomes the leader - var _markAsDown: Set + var _markAsDown: Set init(_ settings: TimeoutBasedDowningStrategySettings, selfNode: UniqueNode) { self.settings = settings self.selfNode = selfNode self._unreachable = [] self._markAsDown = [] - self.membership = Cluster.Membership().joining(selfNode) + self.membership = .empty } } -// FIXME: Implement more in terms of "change" APIs extension TimeoutBasedDowningStrategy: DowningStrategy { - func onMemberUnreachable(_ member: Cluster.Member) -> DowningStrategyDirectives.MemberUnreachableDirective { - self._unreachable.insert(member.node) + func onClusterEvent(event: Cluster.Event) throws -> DowningStrategyDirective { + switch event { + case .snapshot(let snapshot): + self.membership = snapshot + return .none + + case .membershipChange(let change): + guard let change = self.membership.apply(change) else { + return .none + } + + if change.isAtLeastDown || change.isRemoval || change.isReplacement { + // it was marked as down by someone, we don't need to track it anymore + _ = self._markAsDown.remove(change.member) + _ = self._unreachable.remove(change.member) + return .cancelTimer(key: self.timerKey(change.member)) + } + + return .none + + case .leadershipChange(let change): + return try self.onLeaderChange(to: change.newLeader) + + case .reachabilityChange(let change): + if change.toUnreachable { + return self.onMemberUnreachable(change) + } else { + return self.onMemberReachable(change) + } + } + } + + func onMemberUnreachable(_ change: Cluster.ReachabilityChange) -> DowningStrategyDirective { + _ = self.membership.applyReachabilityChange(change) + let member = change.member + + self._unreachable.insert(member) + + return .startTimer(key: self.timerKey(member), message: .timeout(member), delay: self.settings.downUnreachableMembersAfter) + } - return .startTimer(key: TimerKey(member.node), message: .timeout(member), delay: self.settings.downUnreachableMembersAfter) + func onMemberReachable(_ change: Cluster.ReachabilityChange) -> DowningStrategyDirective { + _ = self.membership.applyReachabilityChange(change) + let member = change.member + + _ = self._markAsDown.remove(member) + if self._unreachable.remove(member) != nil { + return .cancelTimer(key: self.timerKey(member)) + } + + return .none } - func onLeaderChange(to leader: Cluster.Member?) throws -> DowningStrategyDirectives.LeaderChangeDirective { + func timerKey(_ member: Cluster.Member) -> TimerKey { + TimerKey(member.node) + } + + func onLeaderChange(to leader: Cluster.Member?) 
throws -> DowningStrategyDirective { _ = try self.membership.applyLeadershipChange(to: leader) if self.isLeader, !self._markAsDown.isEmpty { @@ -61,34 +111,24 @@ extension TimeoutBasedDowningStrategy: DowningStrategy { } } - func onTimeout(_ member: Cluster.Member) -> DowningStrategyDirectives.TimeoutDirective { - guard let address = self._unreachable.remove(member.node) else { + func onTimeout(_ member: Cluster.Member) -> DowningStrategyDirective { + guard let nodeToDown = self._unreachable.remove(member) else { return .none } - if self.isLeader { - return .markAsDown(address) + return .markAsDown([nodeToDown]) } else { - self._markAsDown.insert(address) + self._markAsDown.insert(nodeToDown) return .none } } - func onMemberRemoved(_ member: Cluster.Member) -> DowningStrategyDirectives.MemberRemovedDirective { - self._markAsDown.remove(member.node) - - if self._unreachable.remove(member.node) != nil { - return .cancelTimer - } - - return .none - } - - func onMemberReachable(_ member: Cluster.Member) -> DowningStrategyDirectives.MemberReachableDirective { - self._markAsDown.remove(member.node) + // TODO: remove this + func onMemberRemoved(_ member: Cluster.Member) -> DowningStrategyDirective { + self._markAsDown.remove(member) - if self._unreachable.remove(member.node) != nil { - return .cancelTimer + if self._unreachable.remove(member) != nil { + return .cancelTimer(key: self.timerKey(member)) } return .none @@ -99,6 +139,6 @@ public struct TimeoutBasedDowningStrategySettings { public var downUnreachableMembersAfter: TimeAmount = .seconds(1) public static var `default`: TimeoutBasedDowningStrategySettings { - return .init() + .init() } } diff --git a/Sources/DistributedActors/Cluster/Leadership.swift b/Sources/DistributedActors/Cluster/Leadership.swift index d271f9410..8ca94e5a2 100644 --- a/Sources/DistributedActors/Cluster/Leadership.swift +++ b/Sources/DistributedActors/Cluster/Leadership.swift @@ -197,13 +197,17 @@ extension Leadership { } // ==== ---------------------------------------------------------------------------------------------------------------- -// MARK: LowestReachableMember election strategy +// MARK: LowestAddressReachableMember election strategy extension Leadership { /// Simple strategy which does not require any additional coordination from members to select a leader. /// /// All `MemberStatus.joining`, `MemberStatus.up` _reachable_ members are sorted by their addresses, - /// and the "lowest" is selected as the leader. // TODO: to be extended to respect member roles as well + /// and the "lowest" is selected as the leader. + /// + /// Only REACHABLE nodes are taken into account. This means that in a situation with a cluster partition, + /// there WILL be multiple leaders. In this coordination free scheme, this is needed in order to avoid "getting stuck", + /// without a leader to perform the unreachable -> down move. // TODO keep thinking if we can do better here, we could to a quorum downing IMHO, and remove this impl completely as it's very "bad". /// /// ### Use cases /// This strategy works well for non critical tasks, which nevertheless benefit from performing them more centrally @@ -239,7 +243,7 @@ extension Leadership { /// // TODO: In situations which need strong guarantees, this leadership election scheme does NOT provide strong enough /// guarantees, and you should consider using another scheme or consensus based modes. 
- public struct LowestAddressMember: LeaderElection { + public struct LowestAddressReachableMember: LeaderElection { let minimumNumberOfMembersToDecide: Int let loseLeadershipIfBelowMinNrOfMembers: Bool @@ -250,7 +254,7 @@ extension Leadership { public mutating func runElection(context: LeaderElectionContext, membership: Cluster.Membership) -> LeaderElectionResult { var membership = membership - let membersToSelectAmong = membership.members(atMost: .up) + let membersToSelectAmong = membership.members(atMost: .up, reachability: .reachable) let enoughMembers = membersToSelectAmong.count >= self.minimumNumberOfMembersToDecide if enoughMembers { @@ -339,7 +343,7 @@ extension ClusterSettings { case .none: return nil case .lowestAddress(let nr): - return Leadership.LowestAddressMember(minimumNrOfMembers: nr) + return Leadership.LowestAddressReachableMember(minimumNrOfMembers: nr) } } } diff --git a/Sources/DistributedActors/Cluster/SWIM/SWIMSettings.swift b/Sources/DistributedActors/Cluster/SWIM/SWIMSettings.swift index ef9bf611c..619fdf551 100644 --- a/Sources/DistributedActors/Cluster/SWIM/SWIMSettings.swift +++ b/Sources/DistributedActors/Cluster/SWIM/SWIMSettings.swift @@ -19,7 +19,7 @@ import Logging public struct SWIMSettings { public static var `default`: SWIMSettings { - return .init() + .init() } /// Optional "SWIM instance name" to be included in log statements, @@ -88,11 +88,13 @@ public struct SWIMFailureDetectorSettings { // FIXME: those timeouts are not the actual timeout, the actual timeout is recalculated each time when we get more `suspect` information - /// Suspicion timeouts are specified as number of probe intervals. E.g. a `probeInterval` - /// of 300 milliseconds and `suspicionTimeoutMax` means that a suspicious node will be - /// marked `.dead` after approx. 900ms. + /// Suspicion timeouts are specified as number of probe intervals. + /// E.g. a `probeInterval = .milliseconds(300)` and `suspicionTimeoutMax = 3` means that a suspicious node + /// will be escalated as `.unreachable` after approximately 900ms. Once it is confirmed dead by the high-level + /// membership (e.g. immediately, or after an additional grace period, or vote), it will be marked `.dead` in swim, + /// and `.down` in the high-level membership. 
public var suspicionTimeoutPeriodsMax: Int = 10 - public var suspicionTimeoutPeriodsMin: Int = 10 + // public var suspicionTimeoutPeriodsMin: Int = 10 // FIXME: this is once we have LHA, Local Health Aware Suspicion public var probeInterval: TimeAmount = .seconds(1) public var pingTimeout: TimeAmount = .milliseconds(300) diff --git a/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift b/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift index 695d0f31a..dc9875aa6 100644 --- a/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift +++ b/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift @@ -327,16 +327,30 @@ internal struct SWIMShell { // TODO: push more of logic into SWIM instance, the calculating // FIXME: use decaying timeout as proposed in lifeguard paper let timeoutPeriods = (self.swim.protocolPeriod - self.swim.settings.failureDetector.suspicionTimeoutPeriodsMax) - for member in self.swim.suspects where member.protocolPeriod <= timeoutPeriods { + context.log.trace("Checking suspicion timeouts...", metadata: [ + "swim/suspects": "\(self.swim.suspects)", + "swim/protocolPeriod": "\(self.swim.protocolPeriod)", + "swim/timeoutPeriods": "\(timeoutPeriods)", + ]) + for member in self.swim.suspects { + context.log.trace("Checking \(member)...") + if member.protocolPeriod <= timeoutPeriods { + () // ok, continue checking + } else { + continue // skip + } + if let node = member.ref.address.node { if let incarnation = member.status.incarnation { + context.log.trace("Marking \(member.node) as .unreachable!") self.swim.mark(member.ref, as: .unreachable(incarnation: incarnation)) } // if unreachable or dead, we don't need to notify the clusterRef if member.status.isUnreachable || member.status.isDead { continue } - self.clusterRef.tell(.command(.reachabilityChanged(node, .unreachable))) + context.log.info("Notifying cluster, node \(member.node) is unreachable!") + self.clusterRef.tell(.command(.failureDetectorReachabilityChanged(node, .unreachable))) } } } diff --git a/Sources/DistributedActors/Pattern/ConvergentGossip.swift b/Sources/DistributedActors/Pattern/ConvergentGossip.swift index 4e0cd3232..c3a2f8f4c 100644 --- a/Sources/DistributedActors/Pattern/ConvergentGossip.swift +++ b/Sources/DistributedActors/Pattern/ConvergentGossip.swift @@ -14,6 +14,8 @@ /// Convergent gossip is a gossip mechanism which aims to equalize some state across all peers participating. final class ConvergentGossip { + typealias GossipPeerRef = ActorRef + let settings: Settings // TODO: store Envelope and inside it the payload @@ -68,9 +70,15 @@ final class ConvergentGossip { } } - private func onIntroducePeer(_ context: ActorContext, peer: ActorRef) { + private func onIntroducePeer(_ context: ActorContext, peer: GossipPeerRef) { if self.peers.insert(context.watch(peer)).inserted { - context.log.trace("Added peer: \(peer), total peers: [\(self.peers.count)]: \(self.peers)") + context.log.trace("Got introduced to peer [\(peer)], pushing initial gossip immediately", metadata: [ + "gossip/peerCount": "\(self.peers.count)", + "gossip/peers": "\(self.peers.map { $0.address })", + ]) + + // TODO: implement this rather as "high priority peer to gossip to" + self.sendGossip(context, to: peer) // TODO: consider if we should do a quick gossip to any new peers etc // TODO: peers are removed when they die, no manual way to do it } @@ -97,29 +105,35 @@ final class ConvergentGossip { // FIXME: this is still just broadcasting (!) 
private func onPeriodicGossipTick(_ context: ActorContext) { + guard let target = self.peers.shuffled().first else { + // no peer to gossip to... + return + } + + self.sendGossip(context, to: target) + self.scheduleNextGossipRound(context: context) + } + + private func sendGossip(_ context: ActorContext, to target: GossipPeerRef) { guard let payload = self.payload else { - context.log.trace("No payload set, skipping gossip round.") - self.scheduleNextGossipRound(context: context) + context.log.trace("No payload set, skipping gossip round", metadata: [ + "gossip/target": "\(target)", + ]) return } - // TODO: Optimization looking at seen table, decide who is not going to gain info form us anyway, and de-prioritize them - // That's nicer for small clusters, I guess - // let gossipCandidatePeers = self. - let envelope = GossipEnvelope(payload: payload) // TODO: carry all the vector clocks + // TODO: Optimization looking at seen table, decide who is not going to gain info form us anyway, and de-prioritize them that's nicer for small clusters, I guess + let envelope = GossipEnvelope(payload: payload) // TODO: carry all the vector clocks here rather in the payload - // FIXME: this gossips to ALL, but should randomly pick some, with preference of nodes which are "behind" - context.log.trace("Sending gossip to \(self.peers)", metadata: [ - "gossip/peers": "\(self.peers)", + // TODO: if we have seen tables, we can use them to bias the gossip towards the "more behind" nodes + context.log.trace("Sending gossip to \(target)", metadata: [ + "gossip/target": "\(target.address)", + "gossip/peerCount": "\(self.peers.count)", + "gossip/peers": "\(self.peers.map { $0.address })", "actor/message": "\(envelope)", ]) - // TODO: if we have seen tables, we can use them to bias the gossip towards the "more behind" nodes - if let peer = self.peers.shuffled().first { - peer.tell(.gossip(envelope)) - } - - self.scheduleNextGossipRound(context: context) + target.tell(.gossip(envelope)) } private func scheduleNextGossipRound(context: ActorContext) { @@ -137,7 +151,7 @@ extension ConvergentGossip { // local messages case updatePayload(Payload) - case introducePeer(ActorRef) + case introducePeer(GossipPeerRef) // internal messages case _clusterEvent(Cluster.Event) diff --git a/Sources/DistributedActors/Refs.swift b/Sources/DistributedActors/Refs.swift index d75140f18..f08b59e9a 100644 --- a/Sources/DistributedActors/Refs.swift +++ b/Sources/DistributedActors/Refs.swift @@ -544,7 +544,7 @@ public class Guardian { func makeChild(path: ActorPath, spawn: () throws -> ActorShell) throws -> ActorRef { return try self._childrenLock.synchronized { if self.stopping { - throw ActorContextError.alreadyStopping + throw ActorContextError.alreadyStopping("system: \(self.system?.name ?? "")") } if self._children.contains(name: path.name) { diff --git a/Tests/DistributedActorsTests/ActorSystemTestBase.swift b/Sources/DistributedActorsTestKit/ActorSystemTestBase.swift similarity index 75% rename from Tests/DistributedActorsTests/ActorSystemTestBase.swift rename to Sources/DistributedActorsTestKit/ActorSystemTestBase.swift index 65bfa6d0e..876ab6b3e 100644 --- a/Tests/DistributedActorsTests/ActorSystemTestBase.swift +++ b/Sources/DistributedActorsTestKit/ActorSystemTestBase.swift @@ -14,46 +14,46 @@ @testable import DistributedActors import DistributedActorsConcurrencyHelpers -import DistributedActorsTestKit import NIO import XCTest /// Base class to handle the repetitive setUp/tearDown code involved in most ActorSystem requiring tests. 
-class ActorSystemTestBase: ClusteredNodesTestBase { - var system: ActorSystem { +// TODO: Document and API guarantees +open class ActorSystemTestBase: ClusteredNodesTestBase { + public var system: ActorSystem { guard let node = self._nodes.first else { fatalError("No system spawned!") } return node } - var eventLoopGroup: EventLoopGroup { + public var eventLoopGroup: EventLoopGroup { self.system._eventLoopGroup } - var testKit: ActorTestKit { + public var testKit: ActorTestKit { self.testKit(self.system) } - var logCapture: LogCapture { + public var logCapture: LogCapture { guard let handler = self._logCaptures.first else { fatalError("No log capture installed!") } return handler } - override func setUp() { + open override func setUp() { super.setUp() _ = self.setUpNode(String(describing: type(of: self))) { _ in () } } - override func tearDown() { + open override func tearDown() { super.tearDown() } - override func setUpNode(_ name: String, _ modifySettings: ((inout ActorSystemSettings) -> Void)?) -> ActorSystem { + open override func setUpNode(_ name: String, _ modifySettings: ((inout ActorSystemSettings) -> Void)?) -> ActorSystem { super.setUpNode(name) { settings in settings.cluster.enabled = false modifySettings?(&settings) diff --git a/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift b/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift index 0e9a44259..e645800a2 100644 --- a/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift +++ b/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift @@ -63,10 +63,15 @@ open class ClusteredNodesTestBase: XCTestCase { let node = ActorSystem(name) { settings in settings.cluster.enabled = true settings.cluster.node.port = self.nextPort() + if self.captureLogs { settings.overrideLoggerFactory = capture.loggerFactory(captureLabel: name) } + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + + settings.cluster.swim.failureDetector.suspicionTimeoutPeriodsMax = 3 + self.configureActorSystem(settings: &settings) modifySettings?(&settings) } diff --git a/Sources/DistributedActorsTestKit/LogCapture.swift b/Sources/DistributedActorsTestKit/LogCapture.swift index a7ec96482..49e3114b5 100644 --- a/Sources/DistributedActorsTestKit/LogCapture.swift +++ b/Sources/DistributedActorsTestKit/LogCapture.swift @@ -135,7 +135,7 @@ extension LogCapture { } metadataString.append(allString) } - metadataString = String(metadataString) + metadataString = String(metadataString.dropLast(1)) } } let date = ActorOriginLogHandler._createFormatter().string(from: log.date) @@ -228,13 +228,14 @@ extension LogCapture { public func shouldContain( prefix: String? = nil, message: String? = nil, + grep: String? = nil, at level: Logger.Level? = nil, expectedFile: String? 
= nil, expectedLine: Int = -1, failTest: Bool = true, file: StaticString = #file, line: UInt = #line, column: UInt = #column ) throws -> CapturedLogMessage { - precondition(prefix != nil || message != nil || level != nil, "At least one query parameter must be not `nil`!") + precondition(prefix != nil || message != nil || grep != nil || level != nil || level != nil || expectedFile != nil, "At least one query parameter must be not `nil`!") let callSite = CallSiteInfo(file: file, line: line, column: column, function: #function) let found = self.logs.lazy @@ -254,6 +255,12 @@ extension LogCapture { } else { return true } + }.filter { log in + if let expected = grep { + return "\(log)".contains(expected) + } else { + return true + } }.filter { log in if let expected = level { return log.level == expected diff --git a/Sources/DistributedActorsTestKit/TestProbes.swift b/Sources/DistributedActorsTestKit/TestProbes.swift index e118d652d..67b95dbc2 100644 --- a/Sources/DistributedActorsTestKit/TestProbes.swift +++ b/Sources/DistributedActorsTestKit/TestProbes.swift @@ -82,7 +82,7 @@ public class ActorTestProbe { do { self.internalRef = try spawn(behavior) } catch { - fatalError("Failed to spawn \(ActorTestProbe.self): \(error)", file: file, line: line) + fatalError("Failed to spawn \(ActorTestProbe.self): [\(error)]:\(String(reflecting: type(of: error)))", file: file, line: line) } self.name = self.internalRef.address.name diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests+XCTest.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests+XCTest.swift new file mode 100644 index 000000000..c9776374e --- /dev/null +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests+XCTest.swift @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2020 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +import XCTest + +/// +/// NOTE: This file was generated by generate_linux_tests.rb +/// +/// Do NOT edit this file directly as it will be regenerated automatically when needed. +/// + +extension ActorSingletonPluginClusteredTests { + static var allTests: [(String, (ActorSingletonPluginClusteredTests) -> () throws -> Void)] { + return [ + ("test_singletonByClusterLeadership", test_singletonByClusterLeadership), + ("test_singletonByClusterLeadership_stashMessagesIfNoLeader", test_singletonByClusterLeadership_stashMessagesIfNoLeader), + ("test_singletonByClusterLeadership_withLeaderChange", test_singletonByClusterLeadership_withLeaderChange), + ] + } +} diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift new file mode 100644 index 000000000..e5ae085e4 --- /dev/null +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift @@ -0,0 +1,239 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2019 Apple Inc. 
and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +@testable import ActorSingletonPlugin +import DistributedActors +import DistributedActorsTestKit +import XCTest + +final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { + func test_singletonByClusterLeadership() throws { + try shouldNotThrow { + var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) + singletonSettings.allocationStrategy = .leadership + + let first = self.setUpNode("first") { settings in + settings.cluster.node.port = 7111 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let second = self.setUpNode("second") { settings in + settings.cluster.node.port = 8222 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let third = self.setUpNode("third") { settings in + settings.cluster.node.port = 9333 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + + first.cluster.join(node: second.cluster.node.node) + third.cluster.join(node: second.cluster.node.node) + + // `first` will be the leader (lowest address) and runs the singleton + try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) + + let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) + let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) + + let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: String.self) + let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) + + let replyProbe3 = self.testKit(third).spawnTestProbe(expecting: String.self) + let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) + + try replyProbe1.expectMessage("Hello-1 Charlie!") + try replyProbe2.expectMessage("Hello-1 Charlie!") + try replyProbe3.expectMessage("Hello-1 Charlie!") + } + } + + func test_singletonByClusterLeadership_stashMessagesIfNoLeader() throws { + try shouldNotThrow { + var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) + singletonSettings.allocationStrategy = .leadership + + let first = self.setUpNode("first") { settings in + settings.cluster.node.port = 7111 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: 
singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let second = self.setUpNode("second") { settings in + settings.cluster.node.port = 8222 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let third = self.setUpNode("third") { settings in + settings.cluster.node.port = 9333 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + + // No leader so singleton is not available, messages sent should be stashed + let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) + let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) + + let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: String.self) + let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) + + let replyProbe3 = self.testKit(third).spawnTestProbe(expecting: String.self) + let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) + + try replyProbe1.expectNoMessage(for: .milliseconds(200)) + try replyProbe2.expectNoMessage(for: .milliseconds(200)) + try replyProbe3.expectNoMessage(for: .milliseconds(200)) + + first.cluster.join(node: second.cluster.node.node) + third.cluster.join(node: second.cluster.node.node) + + // `first` becomes the leader (lowest address) and runs the singleton + try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) + + try replyProbe1.expectMessage("Hello-1 Charlie!") + try replyProbe2.expectMessage("Hello-1 Charlie!") + try replyProbe3.expectMessage("Hello-1 Charlie!") + } + } + + func test_singletonByClusterLeadership_withLeaderChange() throws { + try shouldNotThrow { + var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) + singletonSettings.allocationStrategy = .leadership + + let first = self.setUpNode("first") { settings in + settings.cluster.node.port = 7111 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let second = self.setUpNode("second") { settings in + settings.cluster.node.port = 8222 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let third = self.setUpNode("third") { settings in + settings.cluster.node.port = 9333 + settings.cluster.autoLeaderElection = 
.lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + let fourth = self.setUpNode("fourth") { settings in + settings.cluster.node.port = 7444 + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + + settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-4"))) + + settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) + } + + first.cluster.join(node: second.cluster.node.node) + third.cluster.join(node: second.cluster.node.node) + + try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) + + let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) + let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) + + let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: String.self) + let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) + + let replyProbe3 = self.testKit(third).spawnTestProbe(expecting: String.self) + let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) + ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) + + // `first` has the lowest address so it should be the leader and singleton + try replyProbe1.expectMessage("Hello-1 Charlie!") + try replyProbe2.expectMessage("Hello-1 Charlie!") + try replyProbe3.expectMessage("Hello-1 Charlie!") + + // Take down the leader + first.cluster.down(node: first.cluster.node.node) // TODO: minimize a test just for the self downing +// third.cluster.down(node: first.cluster.node.node) + + // Make sure that `second` and `third` see `first` as down and become leader-less + try self.testKit(second).eventually(within: .seconds(10)) { + try self.assertMemberStatus(on: second, node: first.cluster.node, is: .down) + try self.assertLeaderNode(on: second, is: nil) + } + try self.testKit(third).eventually(within: .seconds(10)) { + try self.assertMemberStatus(on: third, node: first.cluster.node, is: .down) + try self.assertLeaderNode(on: third, is: nil) + } + + // No leader so singleton is not available, messages sent should be stashed + ref2.tell(.greet(name: "Charlie-2", _replyTo: replyProbe2.ref)) + ref3.tell(.greet(name: "Charlie-3", _replyTo: replyProbe3.ref)) + + // `fourth` will become the new leader and singleton + fourth.cluster.join(node: second.cluster.node.node) + + try self.ensureNodes(.up, within: .seconds(10), systems: fourth, second, third) + + // The stashed messages get routed to new singleton running on `fourth` + try replyProbe2.expectMessage("Hello-4 Charlie-2!") + try replyProbe3.expectMessage("Hello-4 Charlie-3!") + } + } +} + +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: Test utilities + +struct GreeterSingleton: Actorable { + static let name = "greeter" + + private let greeting: String + + init(_ greeting: String) { + self.greeting = greeting + } + + func greet(name: String) -> String { + "\(self.greeting) \(name)!" 
+ } +} diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests+XCTest.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests+XCTest.swift index 58d562c13..4bbc79b9b 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests+XCTest.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests+XCTest.swift @@ -23,10 +23,7 @@ import XCTest extension ActorSingletonPluginTests { static var allTests: [(String, (ActorSingletonPluginTests) -> () throws -> Void)] { return [ - ("test_nonCluster", test_nonCluster), - ("test_singletonByClusterLeadership", test_singletonByClusterLeadership), - ("test_singletonByClusterLeadership_stashMessagesIfNoLeader", test_singletonByClusterLeadership_stashMessagesIfNoLeader), - ("test_singletonByClusterLeadership_withLeaderChange", test_singletonByClusterLeadership_withLeaderChange), + ("test_ClusterSingleton_shouldWorkWithoutCluster", test_ClusterSingleton_shouldWorkWithoutCluster), ] } } diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift index 63edc4f26..bfdcac6c9 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift @@ -17,10 +17,11 @@ import DistributedActors import DistributedActorsTestKit import XCTest -final class ActorSingletonPluginTests: ClusteredNodesTestBase { - func test_nonCluster() throws { +final class ActorSingletonPluginTests: ActorSystemTestBase { + func test_ClusterSingleton_shouldWorkWithoutCluster() throws { // Singleton should work just fine without clustering let system = ActorSystem("test") { settings in + settings.cluster.enabled = false settings += ActorSingleton(GreeterSingleton.name, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello"))) settings += ActorSingleton("\(GreeterSingleton.name)-other", GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hi"))) } diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift new file mode 100644 index 000000000..d3d52ba90 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift @@ -0,0 +1,32 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2020 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +import XCTest + +/// +/// NOTE: This file was generated by generate_linux_tests.rb +/// +/// Do NOT edit this file directly as it will be regenerated automatically when needed. 
+/// + +extension DowningClusteredTests { + static var allTests: [(String, (DowningClusteredTests) -> () throws -> Void)] { + return [ + ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_leaveSelf", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_leaveSelf), + ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downSelf", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downSelf), + ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_shutdownSelf", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_shutdownSelf), + ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downFromSecond", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downFromSecond), + ] + } +} diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift new file mode 100644 index 000000000..ad57579c7 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +@testable import DistributedActors +import DistributedActorsTestKit +import XCTest + +final class DowningClusteredTests: ClusteredNodesTestBase { + enum NodeStopMethod { + case leaveSelf // TODO: eventually this one will be more graceful, ensure others see us leave etc + case downSelf + case shutdownSelf + case downFromSecond + } + + func shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: NodeStopMethod) throws { + let (first, second) = self.setUpPair { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = .timeout(.default) + } + let third = self.setUpNode("third") { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = .timeout(.default) + } + try self.joinNodes(node: first, with: second, ensureMembers: .up) + try self.joinNodes(node: third, with: second, ensureMembers: .up) + + switch stopMethod { + case .leaveSelf: + first.cluster.leave() + case .downSelf: + first.cluster.down(node: first.cluster.node.node) + case .shutdownSelf: + first.shutdown() + case .downFromSecond: + second.cluster.down(node: first.cluster.node.node) + } + + switch stopMethod { + case .leaveSelf, .downSelf, .downFromSecond: + _ = try shouldNotThrow { + try self.testKit(second).eventually(within: .seconds(10)) { + try self.capturedLogs(of: first).shouldContain(prefix: "Self node was marked [.down]!", failTest: false) + } + } + case .shutdownSelf: + () // no extra assertion + } + + _ = try shouldNotThrow { + try self.testKit(second).eventually(within: .seconds(20)) { + // at least one of the nodes will first realize that first is unreachable, + // upon doing so, the leader election picks a new 
leader -- it will be `second` on lowest-reachable-address leadership for example. + // As we now have a leader, downing may perform it's move.\ + + try self.capturedLogs(of: second).shouldContain(grep: "@localhost:9001, status: down", failTest: false) + try self.capturedLogs(of: third).shouldContain(grep: "@localhost:9001, status: down", failTest: false) + } + } + } + + func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_leaveSelf() throws { + try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .leaveSelf) + } + + func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downSelf() throws { + try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .downSelf) + } + + func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_shutdownSelf() throws { + try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .shutdownSelf) + } + + func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downFromSecond() throws { + try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .downFromSecond) + } +} diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests+XCTest.swift index 4007eadb2..d0a0927a0 100644 --- a/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests+XCTest.swift @@ -26,13 +26,13 @@ extension TimeoutBasedDowningInstanceTests { ("test_onLeaderChange_whenNotLeaderAndNewLeaderIsSelfAddress_shouldBecomeLeader", test_onLeaderChange_whenNotLeaderAndNewLeaderIsSelfAddress_shouldBecomeLeader), ("test_onLeaderChange_whenNotLeaderAndNewLeaderIsOtherAddress_shouldNotBecomeLeader", test_onLeaderChange_whenNotLeaderAndNewLeaderIsOtherAddress_shouldNotBecomeLeader), ("test_onLeaderChange_whenLeaderAndNewLeaderIsOtherAddress_shouldLoseLeadership", test_onLeaderChange_whenLeaderAndNewLeaderIsOtherAddress_shouldLoseLeadership), - ("test_onLeaderChange_whenNonMemberSelectedAsLeader_shouldThrow", test_onLeaderChange_whenNonMemberSelectedAsLeader_shouldThrow), ("test_onLeaderChange_whenLeaderAndNewLeaderIsSelfAddress_shouldStayLeader", test_onLeaderChange_whenLeaderAndNewLeaderIsSelfAddress_shouldStayLeader), ("test_onLeaderChange_whenLeaderAndNoNewLeaderIsElected_shouldLoseLeadership", test_onLeaderChange_whenLeaderAndNoNewLeaderIsElected_shouldLoseLeadership), ("test_onLeaderChange_whenNotLeaderAndNoNewLeaderIsElected_shouldNotBecomeLeader", test_onLeaderChange_whenNotLeaderAndNoNewLeaderIsElected_shouldNotBecomeLeader), ("test_onLeaderChange_whenBecomingLeaderAndNodesPendingToBeDowned_shouldReturnMarkAsDown", test_onLeaderChange_whenBecomingLeaderAndNodesPendingToBeDowned_shouldReturnMarkAsDown), ("test_onTimeout_whenNotCurrentlyLeader_shouldInsertMemberAddressIntoMarkAsDown", test_onTimeout_whenNotCurrentlyLeader_shouldInsertMemberAddressIntoMarkAsDown), ("test_onTimeout_whenCurrentlyLeader_shouldReturnMarkAsDown", test_onTimeout_whenCurrentlyLeader_shouldReturnMarkAsDown), + ("test_onTimeout_shouldNotRetainAlreadyIssuedAsDownMembers", test_onTimeout_shouldNotRetainAlreadyIssuedAsDownMembers), ("test_onMemberRemoved_whenMemberWasUnreachable_shouldReturnCancelTimer", test_onMemberRemoved_whenMemberWasUnreachable_shouldReturnCancelTimer), ("test_onMemberRemoved_whenMemberWasMarkAsDown_shouldReturnNone", 
test_onMemberRemoved_whenMemberWasMarkAsDown_shouldReturnNone), ("test_onMemberRemoved_whenMemberNotKnown_shouldReturnNone", test_onMemberRemoved_whenMemberNotKnown_shouldReturnNone), diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift index 679d43492..52941bfa0 100644 --- a/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift @@ -25,6 +25,9 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { let otherNode = UniqueNode(node: Node(systemName: "Test", host: "localhost", port: 9999), nid: .random()) lazy var otherMember = Cluster.Member(node: self.otherNode, status: .up) + let yetAnotherNode = UniqueNode(node: Node(systemName: "Test", host: "localhost", port: 2222), nid: .random()) + lazy var yetAnotherMember = Cluster.Member(node: self.yetAnotherNode, status: .up) + let nonMemberNode = UniqueNode(node: Node(systemName: "Test", host: "localhost", port: 1111), nid: .random()) lazy var nonMember = Cluster.Member(node: self.nonMemberNode, status: .up) @@ -76,13 +79,6 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { } } - func test_onLeaderChange_whenNonMemberSelectedAsLeader_shouldThrow() throws { - let err = shouldThrow { - try self.instance.membership.applyLeadershipChange(to: self.nonMember) - } - "\(err)".shouldStartWith(prefix: "nonMemberLeaderSelected") - } - func test_onLeaderChange_whenLeaderAndNewLeaderIsSelfAddress_shouldStayLeader() throws { _ = try self.instance.membership.applyLeadershipChange(to: self.selfMember) self.instance.isLeader.shouldBeTrue() @@ -105,13 +101,13 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onLeaderChange_whenBecomingLeaderAndNodesPendingToBeDowned_shouldReturnMarkAsDown() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - self.instance._markAsDown.insert(member.node) + self.instance._markAsDown.insert(member) let directive = try self.instance.onLeaderChange(to: self.selfMember) guard case .markAsDown(let addresses) = directive else { throw Boom() } addresses.count.shouldEqual(1) - addresses.shouldContain(member.node) + addresses.shouldContain(member) self.instance.isLeader.shouldBeTrue() } @@ -121,27 +117,53 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onTimeout_whenNotCurrentlyLeader_shouldInsertMemberAddressIntoMarkAsDown() throws { let member = Cluster.Member(node: self.otherNode, status: .up) self.instance.isLeader.shouldBeFalse() - self.instance._unreachable.insert(member.node) + self.instance._unreachable.insert(member) let directive = self.instance.onTimeout(member) guard case .none = directive else { throw TestError("Expected directive to be .none") } - self.instance._markAsDown.shouldContain(member.node) + self.instance._markAsDown.shouldContain(member) } func test_onTimeout_whenCurrentlyLeader_shouldReturnMarkAsDown() throws { let member = Cluster.Member(node: self.otherNode, status: .up) _ = try self.instance.membership.applyLeadershipChange(to: self.selfMember) - self.instance._unreachable.insert(member.node) + self.instance._unreachable.insert(member) let directive = self.instance.onTimeout(member) - guard case .markAsDown(let address) = directive else { + guard case .markAsDown(let node) = directive else { throw TestError("Expected directive to be .markAsDown") } - 
address.shouldEqual(member.node) + node.shouldEqual([member]) + } + + func test_onTimeout_shouldNotRetainAlreadyIssuedAsDownMembers() throws { + _ = self.instance.membership.join(self.selfNode) + _ = self.instance.membership.join(self.otherNode) + _ = self.instance.membership.join(self.yetAnotherNode) + + // we are the leader, if a timeout happens, we should issue .down + _ = try self.instance.onLeaderChange(to: self.selfMember) + + let directive = self.instance.onMemberUnreachable(.init(member: self.otherMember.asUnreachable)) + guard case .startTimer = directive else { + throw TestError("Expected .startTimer, but got \(directive)") + } + + let downDecision = self.instance.onTimeout(self.otherMember) + guard case .markAsDown(let nodesToDown) = downDecision else { + throw TestError("Expected .markAsDown, but got \(directive)") + } + + nodesToDown.shouldEqual([self.otherMember]) + + // since we signalled the .down, no need to retain the member as "to be marked down" anymore, + // if we never cleaned that set it could be technically a memory leak, continuing to accumulate + // all members that we downed over the lifetime of the cluster. + self.instance._markAsDown.shouldBeEmpty() } // ==== ------------------------------------------------------------------------------------------------------------ @@ -149,7 +171,7 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onMemberRemoved_whenMemberWasUnreachable_shouldReturnCancelTimer() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - self.instance._unreachable.insert(member.node) + self.instance._unreachable.insert(member) let directive = self.instance.onMemberRemoved(member) guard case .cancelTimer = directive else { @@ -159,7 +181,7 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onMemberRemoved_whenMemberWasMarkAsDown_shouldReturnNone() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - self.instance._markAsDown.insert(member.node) + self.instance._markAsDown.insert(member) let directive = self.instance.onMemberRemoved(member) guard case .none = directive else { @@ -181,8 +203,8 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onMemberReachable_whenMemberWasUnreachable_shouldReturnCancelTimer() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - self.instance._unreachable.insert(member.node) - let directive = self.instance.onMemberReachable(member) + self.instance._unreachable.insert(member) + let directive = self.instance.onMemberReachable(.init(member: member.asUnreachable)) guard case .cancelTimer = directive else { throw TestError("Expected directive to be .cancelTimer") @@ -191,8 +213,8 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onMemberReachable_whenMemberWasMarkAsDown_shouldReturnNone() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - self.instance._markAsDown.insert(member.node) - let directive = self.instance.onMemberReachable(member) + self.instance._markAsDown.insert(member) + let directive = self.instance.onMemberReachable(.init(member: member.asUnreachable)) guard case .none = directive else { throw TestError("Expected directive to be .none") @@ -201,7 +223,7 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onMemberReachable_whenMemberNotKnown_shouldReturnNone() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - let directive = self.instance.onMemberReachable(member) + let directive = 
self.instance.onMemberReachable(.init(member: member.asUnreachable)) guard case .none = directive else { throw TestError("Expected directive to be .none") @@ -213,9 +235,9 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { func test_onMemberUnreachable_shouldAddAddressOfMemberToUnreachableSet() throws { let member = Cluster.Member(node: self.otherNode, status: .up) - guard case .startTimer = self.instance.onMemberUnreachable(member) else { + guard case .startTimer = self.instance.onMemberUnreachable(.init(member: member.asUnreachable)) else { throw TestError("Expected directive to be .startTimer") } - self.instance._unreachable.shouldContain(member.node) + self.instance._unreachable.shouldContain(member) } } diff --git a/Tests/DistributedActorsTests/Cluster/LeadershipTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/LeadershipTests+XCTest.swift index 24afbca6c..218637016 100644 --- a/Tests/DistributedActorsTests/Cluster/LeadershipTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/LeadershipTests+XCTest.swift @@ -23,14 +23,16 @@ import XCTest extension LeadershipTests { static var allTests: [(String, (LeadershipTests) -> () throws -> Void)] { return [ - ("test_LowestReachableMember_selectLeader", test_LowestReachableMember_selectLeader), - ("test_LowestReachableMember_notEnoughMembersToDecide", test_LowestReachableMember_notEnoughMembersToDecide), - ("test_LowestReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader", test_LowestReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader), - ("test_LowestReachableMember_whenCurrentLeaderDown", test_LowestReachableMember_whenCurrentLeaderDown), - ("test_LowestReachableMember_whenCurrentLeaderDown_enoughMembers", test_LowestReachableMember_whenCurrentLeaderDown_enoughMembers), - ("test_LowestReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers", test_LowestReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers), - ("test_LowestReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers", test_LowestReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers), - ("test_LowestReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers", test_LowestReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers), + ("test_LowestAddressReachableMember_selectLeader", test_LowestAddressReachableMember_selectLeader), + ("test_LowestAddressReachableMember_notEnoughMembersToDecide", test_LowestAddressReachableMember_notEnoughMembersToDecide), + ("test_LowestAddressReachableMember_notEnoughReachableMembersToDecide", test_LowestAddressReachableMember_notEnoughReachableMembersToDecide), + ("test_LowestAddressReachableMember_onlyUnreachableMembers_cantDecide", test_LowestAddressReachableMember_onlyUnreachableMembers_cantDecide), + ("test_LowestAddressReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader", test_LowestAddressReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader), + ("test_LowestAddressReachableMember_whenCurrentLeaderDown", test_LowestAddressReachableMember_whenCurrentLeaderDown), + ("test_LowestAddressReachableMember_whenCurrentLeaderDown_enoughMembers", test_LowestAddressReachableMember_whenCurrentLeaderDown_enoughMembers), + ("test_LowestAddressReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers", test_LowestAddressReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers), + 
("test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers", test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers), + ("test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers", test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers), ] } } diff --git a/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift b/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift index cd539fbcb..51dba63b1 100644 --- a/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift @@ -31,10 +31,10 @@ final class LeadershipTests: XCTestCase { ] // ==== ------------------------------------------------------------------------------------------------------------ - // MARK: LowestReachableMember + // MARK: LowestAddressReachableMember - func test_LowestReachableMember_selectLeader() throws { - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) + func test_LowestAddressReachableMember_selectLeader() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) let membership = self.initialMembership @@ -42,8 +42,8 @@ final class LeadershipTests: XCTestCase { change.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) } - func test_LowestReachableMember_notEnoughMembersToDecide() throws { - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) + func test_LowestAddressReachableMember_notEnoughMembersToDecide() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.remove(self.firstMember.node) @@ -59,8 +59,37 @@ final class LeadershipTests: XCTestCase { change2.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.secondMember)) } - func test_LowestReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader() throws { - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) + func test_LowestAddressReachableMember_notEnoughReachableMembersToDecide() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + + var membership = self.initialMembership + _ = membership.mark(self.secondMember.node, reachability: .unreachable) + + // 2 reachable members -> not enough to make decision anymore + let change1: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() + change1.shouldBeNil() + + _ = membership.join(self.newMember.node) + + // 3 reachable members again, 1 unreachable, should work + let change2: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() + change2.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + } + + func test_LowestAddressReachableMember_onlyUnreachableMembers_cantDecide() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + + var membership = self.initialMembership + _ = membership.mark(self.firstMember.node, reachability: .unreachable) + _ = membership.mark(self.secondMember.node, reachability: .unreachable) + + // 1 reachable member -> not enough to make decision anymore + let change1: Cluster.LeadershipChange? 
= try election.runElection(context: self.fakeContext, membership: membership).future.wait() + change1.shouldBeNil() + } + + func test_LowestAddressReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = try! membership.applyLeadershipChange(to: self.firstMember) // try! because `firstMember` is a member @@ -77,8 +106,8 @@ final class LeadershipTests: XCTestCase { change.shouldEqual(Cluster.LeadershipChange(oldLeader: leader, newLeader: nil)) } - func test_LowestReachableMember_whenCurrentLeaderDown() throws { - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) + func test_LowestAddressReachableMember_whenCurrentLeaderDown() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.join(self.newMember.node) @@ -91,8 +120,8 @@ final class LeadershipTests: XCTestCase { .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.secondMember)) } - func test_LowestReachableMember_whenCurrentLeaderDown_enoughMembers() throws { - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) + func test_LowestAddressReachableMember_whenCurrentLeaderDown_enoughMembers() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.join(self.newMember.node) @@ -105,8 +134,8 @@ final class LeadershipTests: XCTestCase { .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.secondMember)) } - func test_LowestReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers() throws { - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) + func test_LowestAddressReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers() throws { + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership let applyToMembership: (Cluster.LeadershipChange?) throws -> (Cluster.LeadershipChange?) = { change in @@ -128,13 +157,13 @@ final class LeadershipTests: XCTestCase { membership.leader.shouldEqual(self.firstMember) } - func test_LowestReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers() throws { + func test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers() throws { // - 3 nodes join // - first becomes leader // - third leaves // - second leaves // ! no need to drop the leadership from the first node, it shall remain the leader; - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3) // loseLeadershipIfBelowMinNrOfMembers: false by default + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) // loseLeadershipIfBelowMinNrOfMembers: false by default var membership: Cluster.Membership = self.initialMembership let applyToMembership: (Cluster.LeadershipChange?) throws -> (Cluster.LeadershipChange?) = { change in @@ -165,12 +194,12 @@ final class LeadershipTests: XCTestCase { membership.leader.shouldEqual(self.firstMember) } - func test_LowestReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers() throws { + func test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers() throws { // - 3 nodes join // - first becomes leader // - third leaves // ! 
not enough members to sustain leader, it should not be trusted anymore - var election = Leadership.LowestAddressMember(minimumNrOfMembers: 3, loseLeadershipIfBelowMinNrOfMembers: true) + var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3, loseLeadershipIfBelowMinNrOfMembers: true) var membership: Cluster.Membership = self.initialMembership let applyToMembership: (Cluster.LeadershipChange?) throws -> (Cluster.LeadershipChange?) = { change in diff --git a/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstance+ClusterTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstanceClusteredTests+XCTest.swift similarity index 89% rename from Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstance+ClusterTests+XCTest.swift rename to Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstanceClusteredTests+XCTest.swift index 3181060ef..87f34cf36 100644 --- a/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstance+ClusterTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstanceClusteredTests+XCTest.swift @@ -20,8 +20,8 @@ import XCTest /// Do NOT edit this file directly as it will be regenerated automatically when needed. /// -extension SWIMInstanceClusterTests { - static var allTests: [(String, (SWIMInstanceClusterTests) -> () throws -> Void)] { +extension SWIMInstanceClusteredTests { + static var allTests: [(String, (SWIMInstanceClusteredTests) -> () throws -> Void)] { return [ ("test_swim_cluster_onGossipPayload_newMember_needsToConnect_successfully", test_swim_cluster_onGossipPayload_newMember_needsToConnect_successfully), ("test_swim_cluster_onGossipPayload_newMember_needsToConnect_andFails_shouldNotAddMember", test_swim_cluster_onGossipPayload_newMember_needsToConnect_andFails_shouldNotAddMember), diff --git a/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstance+ClusterTests.swift b/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstanceClusteredTests.swift similarity index 98% rename from Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstance+ClusterTests.swift rename to Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstanceClusteredTests.swift index b0de9ae70..ecea78fb3 100644 --- a/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstance+ClusterTests.swift +++ b/Tests/DistributedActorsTests/Cluster/SWIM/SWIMInstanceClusteredTests.swift @@ -17,7 +17,7 @@ import DistributedActorsTestKit import XCTest /// Tests of the SWIM.Instance which require the existence of actor systems, even if the instance tests are driven manually. -final class SWIMInstanceClusterTests: ClusteredNodesTestBase { +final class SWIMInstanceClusteredTests: ClusteredNodesTestBase { var localClusterProbe: ActorTestProbe! var remoteClusterProbe: ActorTestProbe! 
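Note: the new LeadershipTests cases above exercise an election that only counts reachable members: no leader is chosen until at least `minimumNrOfMembers` members are reachable, and the lowest-address reachable member wins. A minimal, self-contained sketch of that rule follows; `SketchMember` and `electLowestReachable` are illustrative stand-ins, not the library's `Cluster.Membership` or `Leadership` types.

// Standalone sketch (script-style Swift), not part of the patched sources.
struct SketchMember {
    let address: String      // stands in for the node's unique address
    let reachable: Bool
}

func electLowestReachable(members: [SketchMember], minimumNrOfMembers: Int) -> SketchMember? {
    let reachable = members.filter { $0.reachable }
    guard reachable.count >= minimumNrOfMembers else {
        return nil // not enough reachable members to safely decide
    }
    // pick the reachable member with the lowest address
    return reachable.min { $0.address < $1.address }
}

// With 3 members required and one marked unreachable, no leader is elected,
// mirroring test_LowestAddressReachableMember_notEnoughReachableMembersToDecide:
let members = [
    SketchMember(address: "1.1.1.1:7111", reachable: true),
    SketchMember(address: "2.2.2.2:8222", reachable: false),
    SketchMember(address: "3.3.3.3:9333", reachable: true),
]
assert(electLowestReachable(members: members, minimumNrOfMembers: 3) == nil)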
diff --git a/Tests/DistributedActorsTests/Cluster/SWIM/SWIMShellTests.swift b/Tests/DistributedActorsTests/Cluster/SWIM/SWIMShellTests.swift index ce24b364e..30afa8dd1 100644 --- a/Tests/DistributedActorsTests/Cluster/SWIM/SWIMShellTests.swift +++ b/Tests/DistributedActorsTests/Cluster/SWIM/SWIMShellTests.swift @@ -232,7 +232,7 @@ final class SWIMShellTests: ClusteredNodesTestBase { ref.tell(.local(.pingRandomMember)) let message = try firstClusterProbe.expectMessage() - guard case .command(.reachabilityChanged(let address, .unreachable)) = message else { + guard case .command(.failureDetectorReachabilityChanged(let address, .unreachable)) = message else { throw self.testKit(first).fail("expected to receive `.command(.markUnreachable)`, but got `\(message)`") } try self.holdStatus(.unreachable(incarnation: 0), for: remoteMemberRef, on: ref, within: .milliseconds(200)) diff --git a/Tests/LinuxMain.swift b/Tests/LinuxMain.swift index 712bf9738..17144ca43 100644 --- a/Tests/LinuxMain.swift +++ b/Tests/LinuxMain.swift @@ -40,6 +40,7 @@ XCTMain([ testCase(ActorMemoryTests.allTests), testCase(ActorNamingTests.allTests), testCase(ActorRefAdapterTests.allTests), + testCase(ActorSingletonPluginClusteredTests.allTests), testCase(ActorSingletonPluginTests.allTests), testCase(ActorSubReceiveTests.allTests), testCase(ActorSystemTests.allTests), @@ -77,6 +78,7 @@ XCTMain([ testCase(DeadlineTests.allTests), testCase(DeathWatchTests.allTests), testCase(DispatcherTests.allTests), + testCase(DowningClusteredTests.allTests), testCase(EventStreamTests.allTests), testCase(FixedThreadPoolTests.allTests), testCase(GenCodableTests.allTests), @@ -104,7 +106,7 @@ XCTMain([ testCase(RemoteMessagingTests.allTests), testCase(RemotingTLSTests.allTests), testCase(RingBufferTests.allTests), - testCase(SWIMInstanceClusterTests.allTests), + testCase(SWIMInstanceClusteredTests.allTests), testCase(SWIMInstanceTests.allTests), testCase(SWIMSerializationTests.allTests), testCase(SWIMShellTests.allTests), From 3772884f24f03b183a9bc2b924d9a9fa34eb0d67 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 02:48:24 +0900 Subject: [PATCH 06/12] =remote #378 fix hope-for-the-best concurrency approach in remote control reaching --- .../ActorRef+RemotePersonality.swift | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift b/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift index a10af56cb..7371c90cf 100644 --- a/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift +++ b/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift @@ -43,9 +43,8 @@ public final class RemotePersonality { // The structure of the shell is such that the only thing that is a field in the class is this associations / remote controls map, // which refs access. all other state is not accessible by anyone else since it is hidden in the actor itself. - // TODO: again... this may be accessed concurrently since many actors invoke send on this ref - // Access only via `self.remoteControl` - private var _cachedAssociationRemoteControl: AssociationRemoteControl? +// // Access only via `self.remoteControl` +// private var _cachedAssociationRemoteControl: AssociationRemoteControl? private let clusterShell: ClusterShell let system: ActorSystem // TODO: maybe don't need to store it and access via clusterShell? 
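Note: the hunk above comments out the `_cachedAssociationRemoteControl` field because many actor refs may invoke sends on this personality concurrently, so an unsynchronized lazy cache is unsafe; the patch re-resolves the remote control through the `ClusterShell` instead. For illustration only, here is a hedged sketch of how such a lazily resolved value could be guarded with a lock; `CachedControl` and `resolve` are hypothetical helpers, not what this patch introduces.

import Foundation

// Sketch: a lock-protected lazy cache, so concurrent senders observe one value.
final class CachedControl<Control> {
    private let lock = NSLock()
    private var cached: Control?
    private let resolve: () -> Control?

    init(resolve: @escaping () -> Control?) {
        self.resolve = resolve
    }

    var value: Control? {
        lock.lock()
        defer { lock.unlock() }
        if let control = cached {
            return control
        }
        let resolved = resolve()
        cached = resolved // only written under the lock
        return resolved
    }
}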
@@ -65,7 +64,8 @@ public final class RemotePersonality { // TODO: optionally carry file/line? remoteControl.sendUserMessage(type: Message.self, envelope: Envelope(payload: .message(message)), recipient: self.address) } else { - self.deadLetters.adapted().tell(message, file: file, line: line) + pprint("no remote control!!!! \(self.address)") + self.system.personalDeadLetters(recipient: self.address).adapted().tell(message, file: file, line: line) } } @@ -81,26 +81,26 @@ public final class RemotePersonality { } } + // FIXME: The test_singletonByClusterLeadership_stashMessagesIfNoLeader exposes that we sometimes need to spin here!!! This is very bad, investigate + // FIXME: https://github.com/apple/swift-distributed-actors/issues/382 private var remoteControl: AssociationRemoteControl? { - // optimally we would: - if let control = self._cachedAssociationRemoteControl { - return control - } else { - // FIXME: has to be done atomically... since other senders also reaching here - guard let remoteAddress = self.address.node else { - fatalError("Attempted to access association remote control yet ref has no address! This should never happen and is a bug.") - } - switch self.clusterShell.associationRemoteControl(with: remoteAddress) { - case .unknown: - return nil - case .associated(let remoteControl): - self._cachedAssociationRemoteControl = remoteControl // TODO: atomically... - return remoteControl - case .tombstone: - return nil - } - // FIXME: not safe, should ?? deadletters perhaps; OR keep internal mini buffer? - // TODO: should check if association died in the meantime? + // optimally we would be abe to cache the remote control // FIXME: optimization change? + // if let control = self._cachedAssociationRemoteControl { + // return control + // } else { ... + + guard let remoteAddress = self.address.node else { + fatalError("Attempted to access association remote control yet ref has no address! This should never happen and is a bug.") + } + switch self.clusterShell.associationRemoteControl(with: remoteAddress) { + case .unknown: + // FIXME: how can we end up in a situation with no association yet; likely we got gossiped a node's address, but not open yet... + return self.remoteControl // FIXME: only limited spins should be allowed (!!!!) + case .associated(let remoteControl): + // self._cachedAssociationRemoteControl = remoteControl // TODO: atomically cache a remote control? + return remoteControl + case .tombstone: + return nil } } } From e74b91a1d2673a7a6140924fdf204ad8c76c019c Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 02:49:49 +0900 Subject: [PATCH 07/12] =singleton minor rephrasing to avoid 2 spots where control flow can "fail" to stash (for same reason) --- .../ActorSingletonProxy.swift | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/Sources/ActorSingletonPlugin/ActorSingletonProxy.swift b/Sources/ActorSingletonPlugin/ActorSingletonProxy.swift index ee771bb34..604a7a380 100644 --- a/Sources/ActorSingletonPlugin/ActorSingletonProxy.swift +++ b/Sources/ActorSingletonPlugin/ActorSingletonProxy.swift @@ -115,7 +115,6 @@ internal class ActorSingletonProxy { } // Update `ref` regardless - context.log.debug("Updating ref for singleton [\(self.settings.name)] to node [\(String(describing: node))]") self.updateRef(context, node: node) } } @@ -161,34 +160,42 @@ internal class ActorSingletonProxy { } private func updateRef(_ context: ActorContext, _ newRef: ActorRef?) 
{ - context.log.debug("Updating ref from [\(String(describing: self.ref))] to [\(String(describing: newRef))], flushing \(self.buffer.count) messages.") + context.log.debug("Updating ref from [\(String(describing: self.ref))] to [\(String(describing: newRef))], flushing \(self.buffer.count) messages") self.ref = newRef // Unstash messages if we have the singleton - if let ref = self.ref { - while let stashed = self.buffer.take() { - ref.tell(stashed) - } + guard let singleton = self.ref else { + return + } + + while let stashed = self.buffer.take() { + context.log.debug("Flushing \(stashed), to \(singleton)") + singleton.tell(stashed) } } private func forwardOrStash(_ context: ActorContext, message: Message) throws { // Forward the message if `singleton` is not `nil`, else stash it. if let singleton = self.ref { - context.log.trace("forwarding message: \(singleton.address)") + context.log.trace("Forwarding message \(message), to: \(singleton.address)", metadata: self.metadata(context)) singleton.tell(message) } else { - context.log.trace("stashing message") - if self.buffer.isFull { - // TODO: log this warning only "once in while" after buffer becomes full - context.log.warning("Buffer is full. Messages might start getting disposed.", metadata: self.metadata(context)) - // Move the oldest message to dead letters to make room - if let oldestMessage = self.buffer.take() { - context.system.deadLetters.tell(DeadLetter(oldestMessage, recipient: context.address)) + do { + try self.buffer.stash(message: message) + context.log.trace("Stashed message: \(message)", metadata: self.metadata(context)) + } catch { + switch error { + case StashError.full: + // TODO: log this warning only "once in while" after buffer becomes full + context.log.warning("Buffer is full. 
Messages might start getting disposed.", metadata: self.metadata(context)) + // Move the oldest message to dead letters to make room + if let oldestMessage = self.buffer.take() { + context.system.deadLetters.tell(DeadLetter(oldestMessage, recipient: context.address)) + } + default: + context.log.warning("Unable to stash message, error: \(error)", metadata: self.metadata(context)) } } - - try self.buffer.stash(message: message) } } } @@ -199,18 +206,19 @@ internal class ActorSingletonProxy { extension ActorSingletonProxy { func metadata(_: ActorContext) -> Logger.Metadata { var metadata: Logger.Metadata = [ - "name": "\(self.settings.name)", - "buffer": "\(self.buffer.count)/\(self.settings.bufferCapacity)", + "tag": "singleton", + "singleton/name": "\(self.settings.name)", + "singleton/buffer": "\(self.buffer.count)/\(self.settings.bufferCapacity)", ] if let targetNode = self.targetNode { metadata["targetNode"] = "\(targetNode)" } if let ref = self.ref { - metadata["ref"] = "\(ref)" + metadata["ref"] = "\(ref.address)" } if let managerRef = self.managerRef { - metadata["managerRef"] = "\(managerRef)" + metadata["managerRef"] = "\(managerRef.address)" } return metadata From 48bb4c1f95912d9ce88a4f0438927701dd618f16 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 02:51:11 +0900 Subject: [PATCH 08/12] =leadership slightly rename to make it more obvious we talk about lowest address reachable nodes --- .../Cluster/ClusterSettings.swift | 2 +- .../Cluster/Leadership.swift | 9 +++-- .../Cluster/ClusteredNodesTestBase.swift | 2 +- .../ActorSingletonPluginClusteredTests.swift | 39 ++++++++++--------- .../ActorableOwnedMembershipTests.swift | 4 +- .../Cluster/ClusterLeaderActionsTests.swift | 18 ++++----- .../ClusterMembershipGossipTests.swift | 2 +- .../DowningClusteredTests.swift | 4 +- .../Cluster/LeadershipTests.swift | 20 +++++----- 9 files changed, 52 insertions(+), 48 deletions(-) diff --git a/Sources/DistributedActors/Cluster/ClusterSettings.swift b/Sources/DistributedActors/Cluster/ClusterSettings.swift index 184d12120..d4b334618 100644 --- a/Sources/DistributedActors/Cluster/ClusterSettings.swift +++ b/Sources/DistributedActors/Cluster/ClusterSettings.swift @@ -110,7 +110,7 @@ public struct ClusterSettings { // ==== ------------------------------------------------------------------------------------------------------------ // MARK: Leader Election - public var autoLeaderElection: LeadershipSelectionSettings = .lowestAddress(minNumberOfMembers: 2) + public var autoLeaderElection: LeadershipSelectionSettings = .lowestReachable(minNumberOfMembers: 2) // ==== ------------------------------------------------------------------------------------------------------------ // MARK: TLS & Security settings diff --git a/Sources/DistributedActors/Cluster/Leadership.swift b/Sources/DistributedActors/Cluster/Leadership.swift index 8ca94e5a2..698958381 100644 --- a/Sources/DistributedActors/Cluster/Leadership.swift +++ b/Sources/DistributedActors/Cluster/Leadership.swift @@ -243,10 +243,11 @@ extension Leadership { /// // TODO: In situations which need strong guarantees, this leadership election scheme does NOT provide strong enough /// guarantees, and you should consider using another scheme or consensus based modes. 
- public struct LowestAddressReachableMember: LeaderElection { + public struct LowestReachableMember: LeaderElection { let minimumNumberOfMembersToDecide: Int let loseLeadershipIfBelowMinNrOfMembers: Bool + /// - param minimumNrOfMembers: minimum number of REACHABLE members when a leader can be elected. public init(minimumNrOfMembers: Int, loseLeadershipIfBelowMinNrOfMembers: Bool = false) { self.minimumNumberOfMembersToDecide = minimumNrOfMembers self.loseLeadershipIfBelowMinNrOfMembers = loseLeadershipIfBelowMinNrOfMembers @@ -336,14 +337,14 @@ extension ClusterSettings { /// No automatic leader selection, you can write your own logic and issue a `Cluster.LeadershipChange` `Cluster.Event` to the `system.cluster.events` event stream. case none /// All nodes get ordered by their node addresses and the "lowest" is always selected as a leader. - case lowestAddress(minNumberOfMembers: Int) + case lowestReachable(minNumberOfMembers: Int) func make(_: ClusterSettings) -> LeaderElection? { switch self { case .none: return nil - case .lowestAddress(let nr): - return Leadership.LowestAddressReachableMember(minimumNrOfMembers: nr) + case .lowestReachable(let nr): + return Leadership.LowestReachableMember(minimumNrOfMembers: nr) } } } diff --git a/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift b/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift index e645800a2..d8377c8f5 100644 --- a/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift +++ b/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift @@ -68,7 +68,7 @@ open class ClusteredNodesTestBase: XCTestCase { settings.overrideLoggerFactory = capture.loggerFactory(captureLabel: name) } - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) settings.cluster.swim.failureDetector.suspicionTimeoutPeriodsMax = 3 diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift index e5ae085e4..4361a7bbe 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift @@ -18,6 +18,10 @@ import DistributedActorsTestKit import XCTest final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { + override var alwaysPrintCaptureLogs: Bool { + true + } + func test_singletonByClusterLeadership() throws { try shouldNotThrow { var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) @@ -25,7 +29,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) @@ -33,7 +37,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let second = self.setUpNode("second") { settings in settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: 
GreeterSingleton("Hello-2"))) @@ -41,7 +45,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let third = self.setUpNode("third") { settings in settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) @@ -79,7 +83,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) @@ -87,7 +91,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let second = self.setUpNode("second") { settings in settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) @@ -95,7 +99,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let third = self.setUpNode("third") { settings in settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) @@ -105,15 +109,15 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { // No leader so singleton is not available, messages sent should be stashed let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) + ref1.tell(.greet(name: "Charlie-1", _replyTo: replyProbe1.ref)) let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: String.self) let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) + ref2.tell(.greet(name: "Charlie-2", _replyTo: replyProbe2.ref)) let replyProbe3 = self.testKit(third).spawnTestProbe(expecting: String.self) let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) + ref3.tell(.greet(name: "Charlie-3", _replyTo: replyProbe3.ref)) try replyProbe1.expectNoMessage(for: .milliseconds(200)) try replyProbe2.expectNoMessage(for: .milliseconds(200)) @@ -125,9 +129,9 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { // `first` becomes the leader (lowest address) and runs the singleton try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) - try replyProbe1.expectMessage("Hello-1 Charlie!") - try replyProbe2.expectMessage("Hello-1 Charlie!") - try replyProbe3.expectMessage("Hello-1 Charlie!") + try replyProbe1.expectMessage("Hello-1 Charlie-1!") + try 
replyProbe2.expectMessage("Hello-1 Charlie-2!") + try replyProbe3.expectMessage("Hello-1 Charlie-3!") } } @@ -138,7 +142,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) @@ -146,7 +150,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let second = self.setUpNode("second") { settings in settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) @@ -154,7 +158,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let third = self.setUpNode("third") { settings in settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) @@ -162,7 +166,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { } let fourth = self.setUpNode("fourth") { settings in settings.cluster.node.port = 7444 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-4"))) @@ -192,8 +196,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { try replyProbe3.expectMessage("Hello-1 Charlie!") // Take down the leader - first.cluster.down(node: first.cluster.node.node) // TODO: minimize a test just for the self downing -// third.cluster.down(node: first.cluster.node.node) + first.cluster.leave() // Make sure that `second` and `third` see `first` as down and become leader-less try self.testKit(second).eventually(within: .seconds(10)) { diff --git a/Tests/DistributedActorsTests/Actorable/ActorableOwnedMembershipTests.swift b/Tests/DistributedActorsTests/Actorable/ActorableOwnedMembershipTests.swift index 20eeae594..cd6424e7f 100644 --- a/Tests/DistributedActorsTests/Actorable/ActorableOwnedMembershipTests.swift +++ b/Tests/DistributedActorsTests/Actorable/ActorableOwnedMembershipTests.swift @@ -20,10 +20,10 @@ final class ActorableOwnedMembershipTests: ClusteredNodesTestBase { func test_autoUpdatedMembership_updatesAutomatically() throws { try shouldNotThrow { let first = self.setUpNode("first") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } let second = self.setUpNode("second") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } try self.joinNodes(node: first, with: second, ensureMembers: .up) diff --git a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift 
b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift index 0752daa2f..e8a9937f5 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift @@ -26,7 +26,7 @@ final class ClusterLeaderActionsTests: ClusteredNodesTestBase { try shouldNotThrow { let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 1) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 1) } let p = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) @@ -62,15 +62,15 @@ final class ClusterLeaderActionsTests: ClusteredNodesTestBase { try shouldNotThrow { let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) } let second = self.setUpNode("second") { settings in settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) } let third = self.setUpNode("third") { settings in settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) } first.cluster.join(node: second.cluster.node.node) @@ -117,13 +117,13 @@ final class ClusterLeaderActionsTests: ClusteredNodesTestBase { // to a new member. // let first = self.setUpNode("first") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } let second = self.setUpNode("second") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } let third = self.setUpNode("third") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } let fourth = self.setUpNode("fourth") { settings in @@ -154,13 +154,13 @@ final class ClusterLeaderActionsTests: ClusteredNodesTestBase { // to a new member. 
// let first = self.setUpNode("first") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } let p1 = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) first.cluster.events.subscribe(p1.ref) let second = self.setUpNode("second") { settings in - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) } let p2 = self.testKit(second).spawnTestProbe(expecting: Cluster.Event.self) second.cluster.events.subscribe(p2.ref) diff --git a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift index 3144947cc..6e739307e 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift @@ -30,7 +30,7 @@ final class ClusterMembershipGossipTests: ClusteredNodesTestBase { func test_down_beGossipedToOtherNodes() throws { try shouldNotThrow { - let strategy = ClusterSettings.LeadershipSelectionSettings.lowestAddress(minNumberOfMembers: 3) + let strategy = ClusterSettings.LeadershipSelectionSettings.lowestReachable(minNumberOfMembers: 3) let first = self.setUpNode("first") { settings in settings.cluster.autoLeaderElection = strategy } diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift index ad57579c7..0fce098bc 100644 --- a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift @@ -27,13 +27,13 @@ final class DowningClusteredTests: ClusteredNodesTestBase { func shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: NodeStopMethod) throws { let (first, second) = self.setUpPair { settings in settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) settings.cluster.downingStrategy = .timeout(.default) } let third = self.setUpNode("third") { settings in settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 2) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) settings.cluster.downingStrategy = .timeout(.default) } diff --git a/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift b/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift index 51dba63b1..09fc9197b 100644 --- a/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift @@ -34,7 +34,7 @@ final class LeadershipTests: XCTestCase { // MARK: LowestAddressReachableMember func test_LowestAddressReachableMember_selectLeader() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) let membership = self.initialMembership @@ -43,7 +43,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_notEnoughMembersToDecide() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + 
var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.remove(self.firstMember.node) @@ -60,7 +60,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_notEnoughReachableMembersToDecide() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.mark(self.secondMember.node, reachability: .unreachable) @@ -77,7 +77,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_onlyUnreachableMembers_cantDecide() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.mark(self.firstMember.node, reachability: .unreachable) @@ -89,7 +89,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_notEnoughMembersToDecide_fromWithToWithoutLeader() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = try! membership.applyLeadershipChange(to: self.firstMember) // try! because `firstMember` is a member @@ -107,7 +107,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_whenCurrentLeaderDown() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.join(self.newMember.node) @@ -121,7 +121,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_whenCurrentLeaderDown_enoughMembers() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership _ = membership.join(self.newMember.node) @@ -135,7 +135,7 @@ final class LeadershipTests: XCTestCase { } func test_LowestAddressReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers() throws { - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership let applyToMembership: (Cluster.LeadershipChange?) throws -> (Cluster.LeadershipChange?) = { change in @@ -163,7 +163,7 @@ final class LeadershipTests: XCTestCase { // - third leaves // - second leaves // ! no need to drop the leadership from the first node, it shall remain the leader; - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3) // loseLeadershipIfBelowMinNrOfMembers: false by default + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) // loseLeadershipIfBelowMinNrOfMembers: false by default var membership: Cluster.Membership = self.initialMembership let applyToMembership: (Cluster.LeadershipChange?) throws -> (Cluster.LeadershipChange?) = { change in @@ -199,7 +199,7 @@ final class LeadershipTests: XCTestCase { // - first becomes leader // - third leaves // ! 
not enough members to sustain leader, it should not be trusted anymore - var election = Leadership.LowestAddressReachableMember(minimumNrOfMembers: 3, loseLeadershipIfBelowMinNrOfMembers: true) + var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3, loseLeadershipIfBelowMinNrOfMembers: true) var membership: Cluster.Membership = self.initialMembership let applyToMembership: (Cluster.LeadershipChange?) throws -> (Cluster.LeadershipChange?) = { change in From 8541ba6461f2f5df16f6eb1833480ccc7a09028e Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 02:52:41 +0900 Subject: [PATCH 09/12] =tests no need for logs dumping always in singleton test --- .../ActorSingletonPluginClusteredTests.swift | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift index 4361a7bbe..ab6d1b549 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift @@ -18,10 +18,6 @@ import DistributedActorsTestKit import XCTest final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { - override var alwaysPrintCaptureLogs: Bool { - true - } - func test_singletonByClusterLeadership() throws { try shouldNotThrow { var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) From 32279a4430f7f48cf5adc9cada7d0c05b65ca5a7 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 02:56:36 +0900 Subject: [PATCH 10/12] =test avoid shutting down nodes, as we want to inspect them --- .../Cluster/ClusterMembershipGossipTests.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift index 6e739307e..5a717e208 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift @@ -33,12 +33,15 @@ final class ClusterMembershipGossipTests: ClusteredNodesTestBase { let strategy = ClusterSettings.LeadershipSelectionSettings.lowestReachable(minNumberOfMembers: 3) let first = self.setUpNode("first") { settings in settings.cluster.autoLeaderElection = strategy + settings.cluster.onDownAction = .none } let second = self.setUpNode("second") { settings in settings.cluster.autoLeaderElection = strategy + settings.cluster.onDownAction = .none } let third = self.setUpNode("third") { settings in settings.cluster.autoLeaderElection = strategy + settings.cluster.onDownAction = .none } first.cluster.join(node: second.cluster.node.node) From 8d7ddf1a07afc9a157ac6eb1192b3929e2cc9cd5 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 19:30:38 +0900 Subject: [PATCH 11/12] =workaround,cluster #383 hacky workaround until resolving properly --- .../ActorRef+RemotePersonality.swift | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift b/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift index 7371c90cf..15b401c6b 100644 --- a/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift +++ b/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift @@ -43,6 +43,7 @@ public final class 
RemotePersonality { // The structure of the shell is such that the only thing that is a field in the class is this associations / remote controls map, // which refs access. all other state is not accessible by anyone else since it is hidden in the actor itself. + // TODO: // // Access only via `self.remoteControl` // private var _cachedAssociationRemoteControl: AssociationRemoteControl? @@ -84,24 +85,29 @@ public final class RemotePersonality { // FIXME: The test_singletonByClusterLeadership_stashMessagesIfNoLeader exposes that we sometimes need to spin here!!! This is very bad, investigate // FIXME: https://github.com/apple/swift-distributed-actors/issues/382 private var remoteControl: AssociationRemoteControl? { - // optimally we would be abe to cache the remote control // FIXME: optimization change? - // if let control = self._cachedAssociationRemoteControl { - // return control - // } else { ... - guard let remoteAddress = self.address.node else { fatalError("Attempted to access association remote control yet ref has no address! This should never happen and is a bug.") } - switch self.clusterShell.associationRemoteControl(with: remoteAddress) { - case .unknown: - // FIXME: how can we end up in a situation with no association yet; likely we got gossiped a node's address, but not open yet... - return self.remoteControl // FIXME: only limited spins should be allowed (!!!!) - case .associated(let remoteControl): - // self._cachedAssociationRemoteControl = remoteControl // TODO: atomically cache a remote control? - return remoteControl - case .tombstone: - return nil + + // FIXME: this is a hack/workaround, see https://github.com/apple/swift-distributed-actors/issues/383 + let maxWorkaroundSpins = 100 + for spinNr in 1 ... maxWorkaroundSpins { + switch self.clusterShell.associationRemoteControl(with: remoteAddress) { + case .unknown: + // FIXME: we may get this if we did a resolve() yet the handshakes did not complete yet + if spinNr == maxWorkaroundSpins { + return self.remoteControl + } // else, fall through to the return nil below + case .associated(let remoteControl): + self.system.log.warning("FIXME: Workaround, ActorRef's RemotePersonality had to spin \(spinNr) times to obtain remoteControl to send message to \(self.address)") + // self._cachedAssociationRemoteControl = remoteControl // TODO: atomically cache a remote control? 
+ return remoteControl + case .tombstone: + return nil + } } + + return nil } } From 73f695100080f5eba04ba6a11e4b652b15fbd5c1 Mon Sep 17 00:00:00 2001 From: Konrad `ktoso` Malawski Date: Tue, 21 Jan 2020 19:37:55 +0900 Subject: [PATCH 12/12] =test fix log assertions, timeout alignents --- .../ActorSingletonPluginClusteredTests.swift | 6 +- .../ActorSingletonPluginTests.swift | 223 ------------------ .../Cluster/ClusterOnDownActionTests.swift | 12 +- .../ShootTheOtherNodeClusteredTests.swift | 8 +- 4 files changed, 11 insertions(+), 238 deletions(-) diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift index ab6d1b549..143c16e81 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift @@ -21,7 +21,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { func test_singletonByClusterLeadership() throws { try shouldNotThrow { var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) - singletonSettings.allocationStrategy = .leadership + singletonSettings.allocationStrategy = .byLeadership let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 @@ -75,7 +75,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { func test_singletonByClusterLeadership_stashMessagesIfNoLeader() throws { try shouldNotThrow { var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) - singletonSettings.allocationStrategy = .leadership + singletonSettings.allocationStrategy = .byLeadership let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 @@ -134,7 +134,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { func test_singletonByClusterLeadership_withLeaderChange() throws { try shouldNotThrow { var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) - singletonSettings.allocationStrategy = .leadership + singletonSettings.allocationStrategy = .byLeadership let first = self.setUpNode("first") { settings in settings.cluster.node.port = 7111 diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift index bfdcac6c9..67f8b15ab 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginTests.swift @@ -45,227 +45,4 @@ final class ActorSingletonPluginTests: ActorSystemTestBase { try replyProbe.expectMessage("Hi Charlie!") } - - func test_singletonByClusterLeadership() throws { - try shouldNotThrow { - var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) - singletonSettings.allocationStrategy = .byLeadership - - let first = self.setUpNode("first") { settings in - settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) - - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - } - let second = self.setUpNode("second") { settings in - settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - - settings += ActorSingleton(settings: singletonSettings, 
GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) - - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - } - let third = self.setUpNode("third") { settings in - settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) - - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - } - - first.cluster.join(node: second.cluster.node.node) - third.cluster.join(node: second.cluster.node.node) - - // `first` will be the leader (lowest address) and runs the singleton - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) - - let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) - let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) - - let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: String.self) - let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) - - let replyProbe3 = self.testKit(third).spawnTestProbe(expecting: String.self) - let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) - - try replyProbe1.expectMessage("Hello-1 Charlie!") - try replyProbe2.expectMessage("Hello-1 Charlie!") - try replyProbe3.expectMessage("Hello-1 Charlie!") - } - } - - func test_singletonByClusterLeadership_stashMessagesIfNoLeader() throws { - try shouldNotThrow { - var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) - singletonSettings.allocationStrategy = .byLeadership - - let first = self.setUpNode("first") { settings in - settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) - - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - } - let second = self.setUpNode("second") { settings in - settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) - - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - } - let third = self.setUpNode("third") { settings in - settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) - - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - } - - // No leader so singleton is not available, messages sent should be stashed - let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) - let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) - - let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: 
String.self) - let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) - - let replyProbe3 = self.testKit(third).spawnTestProbe(expecting: String.self) - let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) - - try replyProbe1.expectNoMessage(for: .milliseconds(200)) - try replyProbe2.expectNoMessage(for: .milliseconds(200)) - try replyProbe3.expectNoMessage(for: .milliseconds(200)) - - first.cluster.join(node: second.cluster.node.node) - third.cluster.join(node: second.cluster.node.node) - - // `first` becomes the leader (lowest address) and runs the singleton - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) - - try replyProbe1.expectMessage("Hello-1 Charlie!") - try replyProbe2.expectMessage("Hello-1 Charlie!") - try replyProbe3.expectMessage("Hello-1 Charlie!") - } - } - - func test_singletonByClusterLeadership_withLeaderChange() throws { - try shouldNotThrow { - var singletonSettings = ActorSingletonSettings(name: GreeterSingleton.name) - singletonSettings.allocationStrategy = .byLeadership - - let first = self.setUpNode("first") { settings in - settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-1"))) - } - let second = self.setUpNode("second") { settings in - settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-2"))) - } - let third = self.setUpNode("third") { settings in - settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-3"))) - } - let fourth = self.setUpNode("fourth") { settings in - settings.cluster.node.port = 7444 - settings.cluster.autoLeaderElection = .lowestAddress(minNumberOfMembers: 3) - settings.serialization.registerCodable(for: GreeterSingleton.Message.self, underId: 10001) - - settings += ActorSingleton(settings: singletonSettings, GreeterSingleton.makeBehavior(instance: GreeterSingleton("Hello-4"))) - } - - try self.joinNodes(node: first, with: second) - try self.joinNodes(node: third, with: second) - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) - - let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) - let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref1.tell(.greet(name: "Charlie", _replyTo: replyProbe1.ref)) - - let replyProbe2 = self.testKit(second).spawnTestProbe(expecting: String.self) - let ref2 = try second.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref2.tell(.greet(name: "Charlie", _replyTo: replyProbe2.ref)) - - let replyProbe3 = 
self.testKit(third).spawnTestProbe(expecting: String.self) - let ref3 = try third.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) - ref3.tell(.greet(name: "Charlie", _replyTo: replyProbe3.ref)) - - // `first` has the lowest address so it should be the leader and singleton - try replyProbe1.expectMessage("Hello-1 Charlie!") - try replyProbe2.expectMessage("Hello-1 Charlie!") - try replyProbe3.expectMessage("Hello-1 Charlie!") - - // Take down the leader - // first.cluster.down(node: first.cluster.node.node) // FIXME: must also work when the node downs itself and shuts down (!!!) (we do not move to down currently, no default downing impl) - second.cluster.down(node: first.cluster.node.node) - - // Ensure the node is seen down - try self.ensureNodes(.down, on: second, systems: first) - try self.ensureNodes(.down, on: third, systems: first) - - // At the same time, since the node was downed, it is not the leader anymore - try self.testKit(third).eventually(within: .seconds(10)) { - try self.assertLeaderNode(on: third, is: nil) - try self.assertLeaderNode(on: second, is: nil) - } - - // `fourth` will become the new leader and singleton - try self.joinNodes(node: fourth, with: second) - try self.ensureNodes(.up, within: .seconds(10), systems: fourth, second, third) - - try self.testKit(first).eventually(within: .seconds(10)) { - // The stashed messages get routed to new singleton running on `fourth` - - ref2.tell(.greet(name: "Charlie-2", _replyTo: replyProbe2.ref)) - guard let reply2 = try replyProbe2.maybeExpectMessage() else { - pprint("lost msg @ node 2") - throw TestError("No reply to \(replyProbe2) yet, singleton still rebalancing...?") - } - reply2.shouldEqual("Hello-4 Charlie-2!") - - ref3.tell(.greet(name: "Charlie-3", _replyTo: replyProbe3.ref)) - guard let reply3 = try replyProbe3.maybeExpectMessage() else { - pprint("lost msg @ node 3") - throw TestError("No reply to \(replyProbe3) yet, singleton still rebalancing...?") - } - reply3.shouldEqual("Hello-4 Charlie-3!") - } - } - } -} - -// ==== ---------------------------------------------------------------------------------------------------------------- -// MARK: Test utilities - -struct GreeterSingleton: Actorable { - static let name = "greeter" - - private let greeting: String - - init(_ greeting: String) { - self.greeting = greeting - } - - func greet(name: String) -> String { - "\(self.greeting) \(name)!" 
- } } diff --git a/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift index 380a288a4..557911888 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterOnDownActionTests.swift @@ -20,17 +20,19 @@ import XCTest final class ClusterOnDownActionTests: ClusteredNodesTestBase { func test_onNodeDowned_performShutdown() throws { try shouldNotThrow { - let (first, second) = self.setUpPair() + let (first, second) = self.setUpPair { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .milliseconds(300)) + } try self.joinNodes(node: first, with: second) second.cluster.down(node: first.cluster.node.node) - try self.capturedLogs(of: first).awaitLogContaining(self.testKit(first), text: "Self node was determined [.down]!") + try self.capturedLogs(of: first).awaitLogContaining(self.testKit(first), text: "Self node was marked [.down]!") - try self.testKit(first).eventually(within: .seconds(1)) { + try self.testKit(first).eventually(within: .seconds(3)) { guard first.isShuttingDown else { - throw TestError("First system should be shutting down, it was marked down and the default onDownAction should have triggered!") + throw TestError("System \(first) was not shutting down. It was marked down and the default onDownAction should have triggered!") } } } @@ -46,7 +48,7 @@ final class ClusterOnDownActionTests: ClusteredNodesTestBase { second.cluster.down(node: first.cluster.node.node) - try self.capturedLogs(of: first).awaitLogContaining(self.testKit(first), text: "Self node was determined [.down]!") + try self.capturedLogs(of: first).awaitLogContaining(self.testKit(first), text: "Self node was marked [.down]!") first.isShuttingDown.shouldBeFalse() } diff --git a/Tests/DistributedActorsTests/Cluster/ShootTheOtherNodeClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/ShootTheOtherNodeClusteredTests.swift index f44fa7f2c..de8215ea6 100644 --- a/Tests/DistributedActorsTests/Cluster/ShootTheOtherNodeClusteredTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ShootTheOtherNodeClusteredTests.swift @@ -18,10 +18,6 @@ import NIO import XCTest final class ShootTheOtherNodeClusteredTests: ClusteredNodesTestBase { - override var alwaysPrintCaptureLogs: Bool { - true - } - override func configureLogCapture(settings: inout LogCapture.Settings) { settings.excludeGrep = [ "TimerKey", @@ -50,9 +46,7 @@ final class ShootTheOtherNodeClusteredTests: ClusteredNodesTestBase { // we do NOT failTest:, since we are in an eventuallyBlock and are waiting for the logs to happen still // the eventually block will escalate the thrown errors if they do not cease within the time limit. try self.capturedLogs(of: remote).shouldContain(prefix: "Received .restInPeace", failTest: false) - try self.capturedLogs(of: remote).shouldContain(prefix: "Self node was determined [.down]", failTest: false) + try self.capturedLogs(of: remote).shouldContain(prefix: "Self node was marked [.down]", failTest: false) } - -// try self.assertMemberStatus(on: local, node: remote.cluster.node, is: .down) } }
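Note: taken together, these patches exercise a handful of cluster settings: lowest-reachable leader election, a timeout-based downing strategy, and an automatic graceful shutdown once a node sees itself marked `.down`. A hedged sketch of combining them when configuring a node is shown below; the closure-based `ActorSystem` initializer is assumed here to mirror the `setUpNode` test helper used above, and the port and delay values are illustrative.

import DistributedActors

// Sketch: settings drawn from the diffs in this series (values are examples only).
let system = ActorSystem("Cluster") { settings in
    settings.cluster.node.port = 7111
    settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2)
    settings.cluster.downingStrategy = .timeout(.default)
    settings.cluster.onDownAction = .gracefulShutdown(delay: .milliseconds(300))
}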