diff --git a/Samples/Sources/SampleCluster/main.swift b/Samples/Sources/SampleCluster/main.swift index 0d9911130..7f4226b49 100644 --- a/Samples/Sources/SampleCluster/main.swift +++ b/Samples/Sources/SampleCluster/main.swift @@ -15,13 +15,9 @@ import Dispatch import DistributedActors -print("Getting args") - var args = CommandLine.arguments args.removeFirst() -print("got args") - print("\(args)") guard args.count >= 1 else { @@ -31,7 +27,7 @@ guard args.count >= 1 else { let system = ActorSystem("System") { settings in settings.cluster.enabled = true settings.cluster.bindPort = Int(args[0])! - settings.cluster.downingStrategy = .none + settings.cluster.downingStrategy = .timeout(.default) settings.defaultLogLevel = .debug } @@ -42,11 +38,8 @@ let ref = try system.spawn("hello", of: Cluster.Event.self, .receive { context, system.cluster.events.subscribe(ref) if args.count >= 3 { - print("getting host") let host = args[1] - print("parsing port") let port = Int(args[2])! - print("Joining") system.cluster.join(node: Node(systemName: "System", host: host, port: port)) } diff --git a/Samples/Sources/SampleDiningPhilosophers/Philosopher.swift b/Samples/Sources/SampleDiningPhilosophers/Philosopher.swift index 06a758ae5..a2ede8fd7 100644 --- a/Samples/Sources/SampleDiningPhilosophers/Philosopher.swift +++ b/Samples/Sources/SampleDiningPhilosophers/Philosopher.swift @@ -11,7 +11,6 @@ // SPDX-License-Identifier: Apache-2.0 // //===----------------------------------------------------------------------===// - import DistributedActors public class Philosopher { @@ -115,7 +114,7 @@ public class Philosopher { case .forkReply(.busy(let fork)): fatalError("Received fork busy response from an unexpected fork: \(fork)! Already in hand: \(inHand), and pending: \(pending)") - // Ignore others... + // Ignore others... case .think: return .ignore // since we'll decide to become thinking ourselves case .eat: return .ignore // since we'll decide to become eating ourselves } diff --git a/Sources/DistributedActors/ActorAddress.swift b/Sources/DistributedActors/ActorAddress.swift index 35afa3b9a..8391124ee 100644 --- a/Sources/DistributedActors/ActorAddress.swift +++ b/Sources/DistributedActors/ActorAddress.swift @@ -564,11 +564,11 @@ public struct Node: Hashable { extension Node: CustomStringConvertible, CustomDebugStringConvertible { public var description: String { - return "\(self.protocol)://\(self.systemName)@\(self.host):\(self.port)" + "\(self.protocol)://\(self.systemName)@\(self.host):\(self.port)" } public var debugDescription: String { - return self.description + self.description } } @@ -576,7 +576,7 @@ extension Node: Comparable { // Silly but good enough comparison for deciding "who is lower node" // as we only use those for "tie-breakers" any ordering is fine to be honest here. 
public static func < (lhs: Node, rhs: Node) -> Bool { - return "\(lhs)" < "\(rhs)" + "\(lhs)" < "\(rhs)" } public func hash(into hasher: inout Hasher) { @@ -586,7 +586,7 @@ extension Node: Comparable { } public static func == (lhs: Node, rhs: Node) -> Bool { - return lhs.protocol == rhs.protocol && lhs.host == rhs.host && lhs.port == rhs.port + lhs.protocol == rhs.protocol && lhs.host == rhs.host && lhs.port == rhs.port } } @@ -624,7 +624,7 @@ public struct UniqueNode: Hashable { self.node.host = newValue } get { - return self.node.host + self.node.host } } @@ -633,14 +633,14 @@ public struct UniqueNode: Hashable { self.node.port = newValue } get { - return self.node.port + self.node.port } } } extension UniqueNode: CustomStringConvertible, CustomDebugStringConvertible { public var description: String { - return "\(self.node)" + "\(self.node)" } public var debugDescription: String { @@ -652,7 +652,7 @@ extension UniqueNode: CustomStringConvertible, CustomDebugStringConvertible { extension UniqueNode: Comparable { public static func == (lhs: UniqueNode, rhs: UniqueNode) -> Bool { // we first compare the NodeIDs since they're quicker to compare and for diff systems always would differ, even if on same physical address - return lhs.nid == rhs.nid && lhs.node == rhs.node + lhs.nid == rhs.nid && lhs.node == rhs.node } // Silly but good enough comparison for deciding "who is lower node" diff --git a/Sources/DistributedActors/ActorSystem.swift b/Sources/DistributedActors/ActorSystem.swift index 91edc1756..ddda1d482 100644 --- a/Sources/DistributedActors/ActorSystem.swift +++ b/Sources/DistributedActors/ActorSystem.swift @@ -267,6 +267,12 @@ public final class ActorSystem { /// Starts plugins after the system is fully initialized self.settings.plugins.startAll(self) + + self.log.info("Actor System [\(self.name)] initialized.") + if settings.cluster.enabled { + self.log.info("Actor System Settings in effect: Cluster.autoLeaderElection: \(self.settings.cluster.autoLeaderElection)") + self.log.info("Actor System Settings in effect: Cluster.downingStrategy: \(self.settings.cluster.downingStrategy)") + } } public convenience init() { diff --git a/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift b/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift index 8f2b69b41..4d784f297 100644 --- a/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift +++ b/Sources/DistributedActors/CRDT/CRDTReplicatorShell.swift @@ -100,7 +100,7 @@ extension CRDT.Replicator { self.remoteReplicators.remove(remoteReplicatorRef) case .snapshot(let snapshot): - Cluster.Membership.diff(from: .empty, to: snapshot).changes.forEach { change in + Cluster.Membership._diff(from: .empty, to: snapshot).changes.forEach { change in self.receiveClusterEvent(context, event: .membershipChange(change)) } diff --git a/Sources/DistributedActors/Clocks/VersionVector.swift b/Sources/DistributedActors/Clocks/VersionVector.swift index 36ea1942d..0b7227428 100644 --- a/Sources/DistributedActors/Clocks/VersionVector.swift +++ b/Sources/DistributedActors/Clocks/VersionVector.swift @@ -39,6 +39,8 @@ public struct VersionVector { // Internal state is a dictionary of replicas and their corresponding version internal var state: [ReplicaId: Int] = [:] + public static let empty: VersionVector = .init() + public static func first(at replicaId: ReplicaId) -> Self { .init((replicaId, 1)) } @@ -89,6 +91,13 @@ public struct VersionVector { self.state.merge(other.state, uniquingKeysWith: max) } + /// Prune any trace of the passed in replica id. 
+ public func pruneReplica(_ replicaId: ReplicaId) -> Self { + var s = self + s.state.removeValue(forKey: replicaId) + return s + } + /// Obtain current version at the given replica. If the replica is unknown, the default version is 0. /// /// - Parameter replicaId: The replica whose version is being queried. diff --git a/Sources/DistributedActors/Cluster/Cluster+Gossip.swift b/Sources/DistributedActors/Cluster/Cluster+Gossip.swift index f9dac8fba..d03e810c9 100644 --- a/Sources/DistributedActors/Cluster/Cluster+Gossip.swift +++ b/Sources/DistributedActors/Cluster/Cluster+Gossip.swift @@ -19,7 +19,7 @@ extension Cluster { /// Gossip payload about members in the cluster. /// /// Used to guarantee phrases like "all nodes have seen a node A in status S", upon which the Leader may act. - struct Gossip { + struct Gossip: Equatable { // TODO: can be moved to generic envelope --------- let owner: UniqueNode /// A table maintaining our perception of other nodes views on the version of membership. @@ -29,7 +29,7 @@ extension Cluster { var seen: Cluster.Gossip.SeenTable /// The version vector of this gossip and the `Membership` state owned by it. var version: VersionVector { - self.seen.table[self.owner]! // !-safe, since we _always)_ know our own world view + self.seen.underlying[self.owner]! // !-safe, since we _always_ know our own world view } // TODO: end of can be moved to generic envelope --------- @@ -44,7 +44,6 @@ extension Cluster { self.seen = Cluster.Gossip.SeenTable(myselfNode: ownerNode, version: VersionVector()) // The actual payload - // self.membership = .initial(ownerNode) self.membership = .empty // MUST be empty, as on the first "self gossip, we cause all ClusterEvents } @@ -62,25 +61,103 @@ extension Cluster { /// Merge an incoming gossip _into_ the current gossip. /// Ownership of this gossip is retained, versions are bumped, and membership is merged. mutating func mergeForward(incoming: Gossip) -> MergeDirective { - // TODO: note: we could technically always just merge anyway; all data we have here is CRDT like anyway - let causalRelation: VersionVector.CausalRelation = self.seen.compareVersion(observedOn: self.owner, to: incoming.version) - self.seen.merge(owner: self.owner, incoming: incoming) // always merge, as we grow our knowledge about what the other node has "seen" + var incoming = incoming + // 1) decide the relationship between this gossip and the incoming one + let causalRelation: VersionVector.CausalRelation = self.version.compareTo(incoming.version) + + // 1.1) Protect the node from any gossip from a .down node (!), it cannot and MUST NOT be trusted. 
+ let incomingGossipOwnerKnownLocally = self.membership.uniqueMember(incoming.owner) + guard let incomingOwnerMember = incoming.membership.uniqueMember(incoming.owner) else { + return .init(causalRelation: causalRelation, effectiveChanges: []) + } + switch incomingGossipOwnerKnownLocally { + case .some(let locallyKnownMember) where locallyKnownMember.status.isDown: + // we have NOT removed it yet, but it is down, so we ignore it + return .init(causalRelation: causalRelation, effectiveChanges: []) + case .none where incomingOwnerMember.status.isAtLeastDown: + // we have likely removed it, and it is down anyway, so we ignore it completely + return .init(causalRelation: causalRelation, effectiveChanges: []) + default: + () // ok, so it is fine and still alive + } + + // 1.2) Protect from zombies: Any nodes that we know are dead or down, we should not accept any information from + let incomingConcurrentDownMembers = incoming.membership.members(atLeast: .down) + for pruneFromIncomingBeforeMerge in incomingConcurrentDownMembers + where self.membership.uniqueMember(pruneFromIncomingBeforeMerge.node) == nil { + _ = incoming.pruneMember(pruneFromIncomingBeforeMerge) + } + + // 2) calculate membership changes; if this gossip is strictly more recent than the incoming one, + // we can skip this as we "know" that we already know everything that the incoming has to offer (optimization) + let changes: [MembershipChange] if case .happenedAfter = causalRelation { + // ignore all changes >> // our local view happened strictly _after_ the incoming one, thus it is guaranteed // it will not provide us with new information; This is only an optimization, and would work correctly without it. - // TODO: consider doing the same for .same? - return .init(causalRelation: causalRelation, effectiveChanges: []) + changes = [] + } else { + // incoming is concurrent, ahead, or same + changes = self.membership.mergeFrom(incoming: incoming.membership, myself: self.owner) + } + + self.seen.merge(selfOwner: self.owner, incoming: incoming.seen) + + // 3) if any removals happened, we need to prune the removed nodes from the seen table + for change in changes + where change.toStatus.isRemoved && change.member.node != self.owner { + self.seen.prune(change.member.node) } - let changes = self.membership.mergeForward(fromAhead: incoming.membership) return .init(causalRelation: causalRelation, effectiveChanges: changes) } + /// Remove member from `membership` and prune the seen tables of any trace of the removed node. + mutating func pruneMember(_ member: Member) -> Cluster.MembershipChange? { + self.seen.prune(member.node) // always prune is okey + let change = self.membership.removeCompletely(member.node) + return change + } + struct MergeDirective { let causalRelation: VersionVector.CausalRelation let effectiveChanges: [Cluster.MembershipChange] } + + /// Checks for convergence of the membership (seen table) among members. + /// + /// ### Convergence + /// Convergence means that "all (considered) members" have seen at-least the version that the convergence + /// is checked against (this version). In other words, if a member is seen as `.joining` in this version + /// other members are guaranteed to have seen this information, or their membership may have progressed further + /// e.g. the member may have already moved to `.up` or further in their perception. 
+ /// + /// Only `.up` and `.leaving` members are considered, since joining members are "too early" + /// to matter in decisions, and down members shall never participate in decision making. + func converged() -> Bool { + let members = self.membership.members(withStatus: [.joining, .up, .leaving]) // FIXME: we should not require joining nodes in convergence, can losen up a bit here I hope + let requiredVersion = self.version + + if members.isEmpty { + return true // no-one is around disagree with me! }:-) + } + + let laggingBehindMemberFound = members.contains { member in + if let memberSeenVersion = self.seen.version(at: member.node) { + switch memberSeenVersion.compareTo(requiredVersion) { + case .happenedBefore, .concurrent: + return true // found an offending member, it is lagging behind, thus no convergence + case .happenedAfter, .same: + return false + } + } else { + return true // no version in other member, thus we have no idea where it's at -> assuming it is behind + } + } + + return !laggingBehindMemberFound + } } } @@ -107,16 +184,19 @@ extension Cluster.Gossip { /// - node B: is the "farthest" along the vector timeline, yet has never seen gossip from C /// - node C (we think): has never seen any gossip from either A or B, realistically though it likely has, /// however it has not yet sent a gossip to "us" such that we could have gotten its updated version vector. - struct SeenTable { - var table: [UniqueNode: VersionVector] + struct SeenTable: Equatable { + var underlying: [UniqueNode: VersionVector] + + init() { + self.underlying = [:] + } init(myselfNode: UniqueNode, version: VersionVector) { - self.table = [myselfNode: version] + self.underlying = [myselfNode: version] } - /// Nodes seen by this table var nodes: Dictionary.Keys { - self.table.keys + self.underlying.keys } /// If the table does NOT include the `node`, we assume that the `latestVersion` is "more recent than no information at all." @@ -126,34 +206,26 @@ extension Cluster.Gossip { /// - SeeAlso: The definition of `VersionVector.CausalRelation` for detailed discussion of all possible relations. func compareVersion(observedOn owner: UniqueNode, to incomingVersion: VersionVector) -> VersionVector.CausalRelation { /// We know that the node has seen _at least_ the membership at `nodeVersion`. - guard let versionOnNode = self.table[owner] else { - return .happenedBefore - } - - return versionOnNode.compareTo(incomingVersion) + (self.underlying[owner] ?? VersionVector()).compareTo(incomingVersion) } - // FIXME: This could be too many layers; - // FIXME: Shouldn't we merge all incoming owner's, from the entire incoming table? !!!!!!!!!!!!!!!!!!!!!!!! - // The information carried in Cluster.Membership includes all information /// Merging an incoming `Cluster.Gossip` into a `Cluster.Gossip.SeenTable` means "progressing (version) time" /// for both "us" and the incoming data's owner in "our view" about it. /// - /// In other words, we gained information and our membership has "moved forward" as - mutating func merge(owner: UniqueNode, incoming: Cluster.Gossip) { - for seenNode in incoming.seen.nodes { - var seenVersion = self.table[seenNode] ?? VersionVector() - seenVersion.merge(other: incoming.seen.version(at: seenNode) ?? VersionVector()) // though always not-nil - self.table[seenNode] = seenVersion - } + /// In other words, we gained information and our membership has "moved forward". + /// + mutating func merge(selfOwner: UniqueNode, incoming: SeenTable) { + var ownerVersion = self.version(at: selfOwner) ?? 
VersionVector() - // in addition, we also merge the incoming table directly with ours, - // as the remote's "own" version means that all information it shared with us in gossip - // is "at least as up to date" as its version, we've now also seen "at least as much" information - // along the vector time. - var localVersion = self.table[owner] ?? VersionVector() - localVersion.merge(other: incoming.version) // we gained information from the incoming gossip - self.table[owner] = localVersion + for incomingNode in incoming.nodes { + if let incomingVersion = incoming.version(at: incomingNode) { + var thatNodeVersion = self.underlying[incomingNode] ?? VersionVector() + thatNodeVersion.merge(other: incomingVersion) + ownerVersion.merge(other: thatNodeVersion) + self.underlying[incomingNode] = thatNodeVersion + } + } + self.underlying[selfOwner] = ownerVersion } // TODO: func haveNotYetSeen(version: VersionVector): [UniqueNode] @@ -171,14 +243,14 @@ extension Cluster.Gossip { /// in the A field, meaning we need to gossip with B to converge those two version vectors. @discardableResult mutating func incrementVersion(owner: UniqueNode, at node: UniqueNode) -> VersionVector { - if var version = self.table[owner] { + if var version = self.underlying[owner] { version.increment(at: .uniqueNode(node)) - self.table[owner] = version + self.underlying[owner] = version return version } else { // we treat incrementing from "nothing" as creating a new entry let version = VersionVector((.uniqueNode(node), 1)) - self.table[owner] = version + self.underlying[owner] = version return version } } @@ -187,21 +259,38 @@ extension Cluster.Gossip { /// This "view" represents "our" latest information about what we know that node has observed. /// This information may (and most likely is) outdated as the nodes continue to gossip to one another. func version(at node: UniqueNode) -> VersionVector? { - self.table[node] + self.underlying[node] + } + + /// Prunes any trace of the passed in node from the seen table. + /// This includes the version vector that this node may have observed, and also any part of other's version vectors + /// where this node was present. + /// + /// Performing this operation should be done with great care, as it means that if "the same exact node" were + /// to "come back" it would be indistinguishable from being a new node. Measures to avoid this from happening + /// must be taken on the cluster layer, by using and checking for tombstones. 
// TODO: make a nasty test for this, a simple one we got; See MembershipGossipSeenTableTests + mutating func prune(_ nodeToPrune: UniqueNode) { + _ = self.underlying.removeValue(forKey: nodeToPrune) + let replicaToPrune: ReplicaId = .uniqueNode(nodeToPrune) + + for (key, version) in self.underlying where version.contains(replicaToPrune, 0) { + self.underlying[key] = version.pruneReplica(replicaToPrune) + // TODO: test removing non existing member + } } } } extension Cluster.Gossip.SeenTable: CustomStringConvertible, CustomDebugStringConvertible { public var description: String { - "Cluster.Gossip.SeenTable(\(self.table))" + "Cluster.Gossip.SeenTable(\(self.underlying))" } var debugDescription: String { var s = "Cluster.Gossip.SeenTable(\n" let entryHeadingPadding = String(repeating: " ", count: 4) let entryPadding = String(repeating: " ", count: 4 * 2) - table.sorted(by: { $0.key < $1.key }).forEach { node, vv in + underlying.sorted(by: { $0.key < $1.key }).forEach { node, vv in let entryHeader = "\(entryHeadingPadding)\(node) observed versions:\n" s.append(entryHeader) diff --git a/Sources/DistributedActors/Cluster/Cluster+Member.swift b/Sources/DistributedActors/Cluster/Cluster+Member.swift index 85f8b1eba..149ad13af 100644 --- a/Sources/DistributedActors/Cluster/Cluster+Member.swift +++ b/Sources/DistributedActors/Cluster/Cluster+Member.swift @@ -37,12 +37,12 @@ extension Cluster { /// Sequence number at which this node was moved to `.up` by a leader. /// The sequence starts at `1`, and 0 means the node was not moved to up _yet_. - public var upNumber: Int? + public var _upNumber: Int? public init(node: UniqueNode, status: Cluster.MemberStatus) { self.node = node self.status = status - self.upNumber = nil + self._upNumber = nil self.reachability = .reachable } @@ -50,7 +50,7 @@ extension Cluster { assert(!status.isJoining, "Node \(node) was \(status) yet was given upNumber: \(upNumber). This is incorrect, as only at-least .up members may have upNumbers!") self.node = node self.status = status - self.upNumber = upNumber + self._upNumber = upNumber self.reachability = .reachable } @@ -70,10 +70,10 @@ extension Cluster { /// Used to gossip a `.down` decision, but not accidentally move the node "back" to down if it already was leaving or removed. public var asDownIfNotAlready: Member { switch self.status { - case .down, .leaving, .removed: - return self - case .joining, .up: + case .joining, .up, .leaving: return Member(node: self.node, status: .down) + case .down, .removed: + return self } } @@ -81,28 +81,41 @@ extension Cluster { /// /// Note that moving only happens along the lifecycle of a member, e.g. trying to move forward from .up do .joining /// will result in a `nil` change and no changes being made to the member. - public mutating func moveForward(_ status: Cluster.MemberStatus) -> Cluster.MembershipChange? { + public mutating func moveForward(to status: Cluster.MemberStatus) -> Cluster.MembershipChange? { + // only allow moving "forward" guard self.status < status else { return nil } - let oldMember = self - self.status = status - // FIXME: potential to lose upNumbers here! 
Need to revisit the upNumber things anyway, not in love with it - return Cluster.MembershipChange(member: oldMember, toStatus: status) + // special handle if we are about to move to .removed, this is only allowed from .down + if status == .removed { + // special handle removals + if self.status == .down { + defer { self.status = .removed } + return .init(member: self, toStatus: .removed) + } else { + return nil + } + } + + defer { self.status = status } + return Cluster.MembershipChange(member: self, toStatus: status) + } + + public func movingForward(to status: MemberStatus) -> Self { + var m = self + _ = m.moveForward(to: status) + return m } } } -extension Cluster.Member { +extension Cluster.Member: Equatable { public func hash(into hasher: inout Hasher) { self.node.hash(into: &hasher) } public static func == (lhs: Cluster.Member, rhs: Cluster.Member) -> Bool { - if lhs.node != rhs.node { - return false - } - return true + lhs.node == rhs.node } } @@ -115,7 +128,13 @@ extension Cluster.Member { /// and stopped on demand. Putting certain types of workloads onto "old(est)" nodes in such clusters has the benefit /// of most likely not needing to balance/move work off them too often (in face of many ad-hoc worker spawns). public static let ageOrdering: (Cluster.Member, Cluster.Member) -> Bool = { l, r in - (l.upNumber ?? 0) < (r.upNumber ?? 0) + (l._upNumber ?? 0) < (r._upNumber ?? 0) + } + + /// An ordering by the members' `node` properties, e.g. 1.1.1.1 is "lower" than 2.2.2.2. + /// This ordering somewhat unusual, however always consistent and used to select a leader -- see `LowestReachableMember`. + public static let lowestAddressOrdering: (Cluster.Member, Cluster.Member) -> Bool = { l, r in + l.node < r.node } } @@ -125,7 +144,7 @@ extension Cluster.Member: CustomStringConvertible, CustomDebugStringConvertible } public var debugDescription: String { - "Member(\(String(reflecting: self.node)), status: \(self.status), reachability: \(self.reachability)\(self.upNumber.map { ", upNumber: \($0)" } ?? ""))" + "Member(\(String(reflecting: self.node)), status: \(self.status), reachability: \(self.reachability)\(self._upNumber.map { ", upNumber: \($0)" } ?? ""))" } } @@ -138,7 +157,7 @@ extension Cluster.Member: Codable { extension Cluster { /// Describes the status of a member within the clusters lifecycle. - public enum MemberStatus: String, Comparable { + public enum MemberStatus: String, CaseIterable, Comparable { /// Describes a node which is connected to at least one other member in the cluster, /// it may want to serve some traffic, however should await the leader moving it to .up /// before it takes on serious work. @@ -170,16 +189,17 @@ extension Cluster { /// with members which have been down as they may contain severely outdated opinions about the cluster and state that it contains. /// In other words: "Members don't talk to zombies." case down - /// Describes a member which is safe to _completely remove_ from future gossips. - /// This status is managed internally and not really of concern to end users (it could be treated equivalent to .down - /// by applications safely). Notably, this status should never really be "stored" in membership, other than for purposes - /// of gossiping to other nodes that they also may remove the node. + + /// Describes a member which _has been completely removed_ from the membership and gossips. 
+ /// + /// This value is not gossiped, rather, in face if an "ahead" (as per version vector time) incoming gossip + /// with a missing entry for a known .down member shall be assumed removed. /// - /// The result of a .removed being gossiped is the complete removal of the associated member from any membership information - /// in the future. As this may pose a risk, e.g. if a `.down` node remains active for many hours for some reason, and - /// we'd have removed it from the membership completely, it would allow such node to "join again" and be (seemingly) - /// a "new node", leading to all kinds of potential issues. Thus the margin to remove members has to be threaded carefully and - /// managed by a leader action, rather than (as .down is) be possible to invoke by any node at any time. + /// Moving into the .removed state may ONLY be performed from a .down state, and must be performed by the cluster + /// leader if and only if the cluster views of all live members are `Cluster.Gossip.converged()`. + /// + /// Note, that a removal also ensures storage of tombstones on the networking layer, such that any future attempts + /// of such node re-connecting will be automatically rejected, disallowing the node to "come back" (which we'd call a "zombie" node). case removed public static let maxStrLen = 7 // hardcoded strlen of the words used for joining...removed; used for padding @@ -226,6 +246,11 @@ extension Cluster.MemberStatus { self == .down } + /// Convenience function to check if a status is `.removed` or `.removed` + public var isAtLeastDown: Bool { + self >= .down + } + /// Convenience function to check if a status is `.removed` public var isRemoved: Bool { self == .removed @@ -276,6 +301,16 @@ extension Cluster { } } +extension Cluster.MemberReachability { + public var isReachable: Bool { + self == .reachable + } + + public var isUnreachable: Bool { + self == .unreachable + } +} + extension Cluster.MemberReachability: Codable { // Codable: synthesized conformance } diff --git a/Sources/DistributedActors/Cluster/Cluster+Membership.swift b/Sources/DistributedActors/Cluster/Cluster+Membership.swift index 511d7e907..1a8026dee 100644 --- a/Sources/DistributedActors/Cluster/Cluster+Membership.swift +++ b/Sources/DistributedActors/Cluster/Cluster+Membership.swift @@ -30,20 +30,15 @@ extension Cluster { /// and attempts to connect to the same cluster as its previous "incarnation". Such situation is called a replacement, and by the assumption /// of that it should not be possible to run many nodes on exact same host+port the previous node is immediately ejected and marked down. /// - // TODO: diagram of state transitions for the members - // TODO: how does seen table relate to this - // TODO: should we not also mark other nodes observations of members in here? - public struct Membership: Hashable, ExpressibleByArrayLiteral { + /// ### Member state transitions + /// Members can only move "forward" along their status lifecycle, refer to `Cluster.MemberStatus` docs for a diagram of legal transitions. 
+ public struct Membership: ExpressibleByArrayLiteral { public typealias ArrayLiteralElement = Cluster.Member public static var empty: Cluster.Membership { .init(members: []) } - internal static func initial(_ myselfNode: UniqueNode) -> Cluster.Membership { - Cluster.Membership.empty.joining(myselfNode) - } - /// Members MUST be stored `UniqueNode` rather than plain node, since there may exist "replacements" which we need /// to track gracefully -- in order to tell all other nodes that those nodes are now down/leaving/removed, if a /// node took their place. This makes lookup by `Node` not nice, but realistically, that lookup is quite rare -- only @@ -69,14 +64,14 @@ extension Cluster { /// This operation is guaranteed to return a member if it was added to the membership UNLESS the member has been `.removed` /// and dropped which happens only after an extended period of time. // FIXME: That period of time is not implemented public func uniqueMember(_ node: UniqueNode) -> Cluster.Member? { - return self._members[node] + self._members[node] } /// Picks "first", in terms of least progressed among its lifecycle member in presence of potentially multiple members /// for a non-unique `Node`. In practice, this happens when an existing node is superseded by a "replacement", and the /// previous node becomes immediately down. - public func firstMember(_ node: Node) -> Cluster.Member? { - return self._members.values.sorted(by: Cluster.MemberStatus.lifecycleOrdering).first(where: { $0.node.node == node }) + public func member(_ node: Node) -> Cluster.Member? { + self._members.values.sorted(by: Cluster.MemberStatus.lifecycleOrdering).first(where: { $0.node.node == node }) } public func youngestMember() -> Cluster.Member? { @@ -87,12 +82,6 @@ extension Cluster { self.members(atLeast: .joining).min(by: Cluster.Member.ageOrdering) } - public func members(_ node: Node) -> [Cluster.Member] { - return self._members.values - .filter { $0.node.node == node } - .sorted(by: Cluster.MemberStatus.lifecycleOrdering) - } - /// Count of all members (regardless of their `MemberStatus`) public var count: Int { self._members.count @@ -115,15 +104,23 @@ extension Cluster { } public func members(withStatus status: Cluster.MemberStatus, reachability: Cluster.MemberReachability? = nil) -> [Cluster.Member] { + self.members(withStatus: [status], reachability: reachability) + } + + public func members(withStatus statuses: Set, reachability: Cluster.MemberReachability? = nil) -> [Cluster.Member] { let reachabilityFilter: (Cluster.Member) -> Bool = { member in reachability == nil || member.reachability == reachability } return self._members.values.filter { - $0.status == status && reachabilityFilter($0) + statuses.contains($0.status) && reachabilityFilter($0) } } public func members(atLeast status: Cluster.MemberStatus, reachability: Cluster.MemberReachability? = nil) -> [Cluster.Member] { + if status == .joining, reachability == nil { + return Array(self._members.values) + } + let reachabilityFilter: (Cluster.Member) -> Bool = { member in reachability == nil || member.reachability == reachability } @@ -133,11 +130,15 @@ extension Cluster { } public func members(atMost status: Cluster.MemberStatus, reachability: Cluster.MemberReachability? 
= nil) -> [Cluster.Member] { + if status == .removed, reachability == nil { + return Array(self._members.values) + } + let reachabilityFilter: (Cluster.Member) -> Bool = { member in reachability == nil || member.reachability == reachability } return self._members.values.filter { - status >= $0.status && reachabilityFilter($0) + $0.status <= status && reachabilityFilter($0) } } @@ -188,6 +189,43 @@ extension Cluster { } } +// Implementation notes: Membership/Member equality +// Membership equality is special, as it manually DOES take into account the Member's states (status, reachability), +// whilst the Member equality by itself does not. This is somewhat ugly, however it allows us to perform automatic +// seen table owner version updates whenever "the membership has changed." We may want to move away from this and make +// these explicit methods, though for now this seems to be the equality we always want when we use Membership, and the +// one we want when we compare members -- as we want to know "does a thing contain this member" rather than "does a thing +// contain this full exact state of a member," whereas for Membership we want to know "is the state of the membership exactly the same." +extension Cluster.Membership: Hashable { + public func hash(into hasher: inout Hasher) { + hasher.combine(self._leaderNode) + for member in self._members.values { + hasher.combine(member.node) + hasher.combine(member.status) + hasher.combine(member.reachability) + } + } + + public static func == (lhs: Cluster.Membership, rhs: Cluster.Membership) -> Bool { + guard lhs._leaderNode == rhs._leaderNode else { + return false + } + guard lhs._members.count == rhs._members.count else { + return false + } + for (lNode, lMember) in lhs._members { + if let rMember = rhs._members[lNode], + lMember.node != rMember.node || + lMember.status != rMember.status || + lMember.reachability != rMember.reachability { + return false + } + } + + return true + } +} + extension Cluster.Membership: CustomStringConvertible, CustomDebugStringConvertible { /// Pretty multi-line output of a membership, useful for manual inspection public func prettyDescription(label: String) -> String { @@ -224,19 +262,25 @@ extension Cluster.Membership { /// - Returns: the resulting change that was applied to the membership; note that this may be `nil`, /// if the change did not cause any actual change to the membership state (e.g. signaling a join of the same node twice). public mutating func apply(_ change: Cluster.MembershipChange) -> Cluster.MembershipChange? { - switch change.toStatus { - case .joining: - if self.uniqueMember(change.node) != nil { - // If we actually already have this node, it is a MARK, not a join "over" - return self.mark(change.node, as: change.toStatus) - } else { - return self.join(change.node) - } - case let status: - if self.firstMember(change.node.node) == nil { // TODO: more general? 
// TODO this entire method should be simpler - _ = self.join(change.node) - } - let change = self.mark(change.node, as: status) + if case .removed = change.toStatus { + return self.removeCompletely(change.node) + } + + if let knownUnique = self.uniqueMember(change.node) { + // it is known uniquely, so we just update its status + return self.mark(knownUnique.node, as: change.toStatus) + } + + if let previousMember = self.member(change.node.node) { + // we are joining "over" an existing incarnation of a node; causing the existing node to become .down immediately + _ = self.removeCompletely(previousMember.node) // the replacement event will handle the down notifications + self._members[change.node] = change.member + + // emit a replacement membership change, this will cause down cluster events for previous member + return .init(replaced: previousMember, by: change.member) + } else { + // node is normally joining + self._members[change.member.node] = change.member return change } } @@ -292,19 +336,10 @@ extension Cluster.Membership { /// Returns the change; e.g. if we replaced a node the change `from` will be populated and perhaps a connection should /// be closed to that now-replaced node, since we have replaced it with a new node. - public mutating func join(_ node: UniqueNode) -> Cluster.MembershipChange { - let newMember = Cluster.Member(node: node, status: .joining) - - if let member = self.firstMember(node.node) { - // we are joining "over" an existing incarnation of a node; causing the existing node to become .down immediately - self._members[member.node] = Cluster.Member(node: member.node, status: .down) - self._members[node] = newMember - return .init(replaced: member, by: newMember) - } else { - // node is normally joining - self._members[node] = newMember - return .init(node: node, fromStatus: nil, toStatus: .joining) - } + public mutating func join(_ node: UniqueNode) -> Cluster.MembershipChange? { + var change = Cluster.MembershipChange(member: Cluster.Member(node: node, status: .joining)) + change.fromStatus = nil + return self.apply(change) } public func joining(_ node: UniqueNode) -> Cluster.Membership { @@ -328,12 +363,12 @@ extension Cluster.Membership { var updatedMember = existingExactMember updatedMember.status = status if status == .up { - updatedMember.upNumber = self.youngestMember()?.upNumber ?? 1 + updatedMember._upNumber = self.youngestMember()?._upNumber ?? 1 } self._members[existingExactMember.node] = updatedMember return Cluster.MembershipChange(member: existingExactMember, toStatus: status) - } else if let beingReplacedMember = self.firstMember(node.node) { + } else if let beingReplacedMember = self.member(node.node) { // We did not get a member by exact UniqueNode match, but we got one by Node match... // this means this new node that we are trying to mark is a "replacement" and the `beingReplacedNode` must be .downed! 
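A minimal standalone sketch of the replacement rule that the reworked `apply(_:)` above implements: a node joining on the same host and port but with a different UID supersedes the previous incarnation, which is dropped and surfaced as a replacement change. The `ToyMembership`, `Node`, and `UniqueNode` types here are simplified stand-ins for illustration, not the library's actual API.

struct Node: Hashable { var host: String; var port: Int }
struct UniqueNode: Hashable { var node: Node; var uid: UInt64 }

struct ToyMembership {
    enum Change { case joined(UniqueNode), replaced(old: UniqueNode, by: UniqueNode) }

    private var members: Set<UniqueNode> = []

    /// Joining on an occupied host:port with a new UID replaces the previous incarnation.
    mutating func join(_ incoming: UniqueNode) -> Change {
        if let previous = members.first(where: { $0.node == incoming.node && $0.uid != incoming.uid }) {
            members.remove(previous) // the old incarnation is assumed down and dropped
            members.insert(incoming)
            return .replaced(old: previous, by: incoming)
        } else {
            members.insert(incoming)
            return .joined(incoming)
        }
    }
}

var membership = ToyMembership()
let first = UniqueNode(node: Node(host: "127.0.0.1", port: 7337), uid: 1)
let restarted = UniqueNode(node: Node(host: "127.0.0.1", port: 7337), uid: 2)
print(membership.join(first))     // joined(...)
print(membership.join(restarted)) // replaced(old: uid 1, by: uid 2)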
@@ -343,11 +378,11 @@ extension Cluster.Membership { // replacement: let replacedNode = Cluster.Member(node: beingReplacedMember.node, status: .down) - let newNode = Cluster.Member(node: node, status: status) + let nodeD = Cluster.Member(node: node, status: status) self._members[replacedNode.node] = replacedNode - self._members[newNode.node] = newNode + self._members[nodeD.node] = nodeD - return Cluster.MembershipChange(replaced: beingReplacedMember, by: newNode) + return Cluster.MembershipChange(replaced: beingReplacedMember, by: nodeD) } else { // no such member -> no change applied return nil @@ -385,9 +420,11 @@ extension Cluster.Membership { } /// REMOVES (as in, completely, without leaving even a tombstone or `down` marker) a `Member` from the `Membership`. - /// /// If the membership is not aware of this member this is treated as no-op. - public mutating func remove(_ node: UniqueNode) -> Cluster.MembershipChange? { + /// + /// - Warning: When removing nodes from cluster one MUST also prune the seen tables (!) of the gossip. + /// Rather than calling this function directly, invoke `Cluster.Gossip.removeMember()` which performs all needed cleanups. + public mutating func removeCompletely(_ node: UniqueNode) -> Cluster.MembershipChange? { if let member = self._members[node] { self._members.removeValue(forKey: node) return .init(member: member, toStatus: .removed) @@ -397,52 +434,103 @@ extension Cluster.Membership { } /// Returns new membership while removing an existing member, identified by the passed in node. - public func removing(_ node: UniqueNode) -> Cluster.Membership { + public func removingCompletely(_ node: UniqueNode) -> Cluster.Membership { var membership = self - _ = membership.remove(node) + _ = membership.removeCompletely(node) return membership } } extension Cluster.Membership { - /// Special Merge function that only moves members "forward" however never removes them, as removal MUST ONLY be + /// Special merge function that only moves members "forward" however never removes them, as removal MUST ONLY be /// issued specifically by a leader working on the assumption that the `incoming` Membership is KNOWN to be "ahead", /// and e.g. if any nodes are NOT present in the incoming membership, they shall be considered `.removed`. /// /// Otherwise, functions as a normal merge, by moving all members "forward" in their respective lifecycles. - /// Meaning the following transitions are possible: + /// + /// The following table illustrates the possible state transitions of a node during a merge: /// /// ``` - /// self | incoming - /// ---------------+----------------------------------------------|------------------------- - /// --> [.joining, .up, .leaving, .down, .removed] --> - /// [.joining] --> [.joining, .up, .leaving, .down, .removed] --> - /// [.up] --> [.up, .leaving, .down, .removed] --> - /// [.leaving] --> [.leaving, .down, .removed] --> - /// [.down] --> [.down, .removed] --> - /// [.removed]* --> [.removed] --> - /// --> --> - /// * realistically a .removed will never be _stored_ it may be incoming which means that a leader has decided that we - /// it is safe to remove the member from the membership, i.e. 
all nodes have converged on seeing it as leaving - /// ``` + /// node's status | "ahead" node's status | resulting node's status + /// ---------------+--------------------------------------|------------------------- + /// --> [.joining, .up, .leaving] --> + /// --> [.down, .removed] --> + /// [.joining] --> [.joining, .up, .leaving, .down] --> + /// [.up] --> [.up, .leaving, .down] --> + /// [.leaving] --> [.leaving, .down] --> + /// [.down] --> [.down] --> + /// [.down] --> (if some node removed) --> + /// [.down] --> (iff myself node removed) --> .removed + /// --> [.removed]** --> + /// + /// * `.removed` is never stored EXCEPT if the `myself` member has been seen removed by other members of the cluster. + /// ** `.removed` should never be gossiped/incoming within the cluster, but if it were to happen it is treated like a removal. + /// + /// Warning: Leaders are not "merged", they get elected by each node (!). /// /// - Returns: any membership changes that occurred (and have affected the current membership). - public mutating func mergeForward(fromAhead ahead: Cluster.Membership) -> [Cluster.MembershipChange] { + public mutating func mergeFrom(incoming: Cluster.Membership, myself: UniqueNode?) -> [Cluster.MembershipChange] { var changes: [Cluster.MembershipChange] = [] - for incomingMember in ahead._members.values { - if var member = self._members[incomingMember.node] { - if let change = member.moveForward(incomingMember.status) { - self._members[incomingMember.node] = member - changes.append(change) + // Set of nodes whose members are currently .down, and not present in the incoming gossip. + // + // as we apply incoming member statuses, remove members from this set + // if any remain in the set, it means they were removed in the incoming membership + // since we strongly assume the incoming one is "ahead" (i.e. `self happenedBefore ahead`), + // we remove these members and emit .removed changes. 
+ var downNodesToRemove: Set = Set(self.members(withStatus: .down).map { $0.node }) + + // 1) move forward any existing members or new members according to the `ahead` statuses + for incomingMember in incoming._members.values { + downNodesToRemove.remove(incomingMember.node) + + guard var knownMember = self._members[incomingMember.node] else { + // member NOT known locally ---------------------------------------------------------------------------- + + // only proceed if the member isn't already on its way out + guard incomingMember.status < Cluster.MemberStatus.down else { + // no need to do anything if it is a removal coming in, yet we already do not know this node + continue } - } else { - // member not known locally + + // it is information about a new member, merge it in self._members[incomingMember.node] = incomingMember - changes.append(.init(member: incomingMember)) + + var change = Cluster.MembershipChange(member: incomingMember) + change.fromStatus = nil // since "new" + changes.append(change) + continue + } + + // it is a known member ------------------------------------------------------------------------------------ + if let change = knownMember.moveForward(to: incomingMember.status) { + if change.toStatus.isRemoved { + self._members.removeValue(forKey: incomingMember.node) + } else { + self._members[incomingMember.node] = knownMember + } + changes.append(change) } } + // 2) if any nodes we know about locally, were not included in the `ahead` membership + changes.append(contentsOf: downNodesToRemove.compactMap { nodeToRemove in + if nodeToRemove == myself { + // we do NOT remove ourselves completely from our own membership, we remain .removed however + return self.mark(nodeToRemove, as: .removed) + } else { + // This is safe since we KNOW the node used to be .down before, + // and removals are only performed on convergent cluster state. + // Thus all members in the cluster have seen the node as down, or already removed it. + // Removal also causes the unique node to be tombstoned in cluster and connections severed, + // such that it shall never be contacted again. + // + // Even if received "old" concurrent gossips with the node still present, we know it would be at-least + // down, and thus we'd NOT add it to the membership again, due to the ` + .down = .` merge rule. + return self.removeCompletely(nodeToRemove) + } + }) + return changes } } @@ -479,8 +567,8 @@ extension Cluster.Membership { extension Cluster.Membership { /// Compute a diff between two membership states. 
- /// The diff includes any member state changes, as well as - internal static func diff(from: Cluster.Membership, to: Cluster.Membership) -> MembershipDiff { + // TODO: diffing is not super well tested, may lose up numbers + internal static func _diff(from: Cluster.Membership, to: Cluster.Membership) -> MembershipDiff { var entries: [Cluster.MembershipChange] = [] entries.reserveCapacity(max(from._members.count, to._members.count)) @@ -492,11 +580,11 @@ extension Cluster.Membership { if let toMember = to.uniqueMember(member.node) { to._members.removeValue(forKey: member.node) if member.status != toMember.status { - entries.append(.init(node: member.node, fromStatus: member.status, toStatus: toMember.status)) + entries.append(.init(member: member, toStatus: toMember.status)) } } else { // member is not present `to`, thus it was removed - entries.append(.init(node: member.node, fromStatus: member.status, toStatus: .removed)) + entries.append(.init(member: member, toStatus: .removed)) } } @@ -540,26 +628,65 @@ extension Cluster { extension Cluster { /// Represents a change made to a `Membership`, it can be received from gossip and shall be applied to local memberships, /// or may originate from local decisions (such as joining or downing). - public struct MembershipChange: Equatable { + public struct MembershipChange: Hashable { + /// Current member that is part of the membership after this change + public internal(set) var member: Cluster.Member + /// The node which the change concerns. - public let node: UniqueNode + public var node: UniqueNode { + self.member.node + } /// Only set if the change is a "replacement", which can happen only if a node joins /// from the same physical address (host + port), however its UID has changed. - public private(set) var replaced: Cluster.Member? + internal private(set) var replaced: Cluster.Member? + + /// A replacement means that a new node appeared on the same host/port, and thus the old node must be assumed down. + internal var replacementDownPreviousNodeChange: Cluster.MembershipChange? { + guard let replacedMember = self.replaced else { + return nil + } + return .init(member: replacedMember, toStatus: .down) + } - public private(set) var fromStatus: Cluster.MemberStatus? + public internal(set) var fromStatus: Cluster.MemberStatus? public let toStatus: Cluster.MemberStatus init(member: Cluster.Member, toStatus: Cluster.MemberStatus? = nil) { - self.node = member.node - self.replaced = nil - self.fromStatus = member.status - self.toStatus = toStatus ?? 
member.status + // FIXME: enable these assertions +// assertBacktrace( +// toStatus == nil || !(toStatus == .removed && member.status != .down), +// """ +// Only legal and expected -> [.removed] transitions are from [.down], \ +// yet attempted to move \(member) to \(toStatus, orElse: "nil") +// """ +// ) + + if let to = toStatus { + var m = member + m.status = to + self.member = m + self.replaced = nil + self.fromStatus = member.status + self.toStatus = to + } else { + self.member = member + self.replaced = nil + self.fromStatus = nil + self.toStatus = member.status + } } init(node: UniqueNode, fromStatus: Cluster.MemberStatus?, toStatus: Cluster.MemberStatus) { - self.node = node + // FIXME: enable these assertions +// assertBacktrace( +// !(toStatus == .removed && fromStatus != .down), +// """ +// Only legal and expected -> [.removed] transitions are from [.down], \ +// yet attempted to move \(node) from \(fromStatus, orElse: "nil") to \(toStatus) +// """ +// ) + self.member = .init(node: node, status: toStatus) self.replaced = nil self.fromStatus = fromStatus self.toStatus = toStatus @@ -571,15 +698,10 @@ extension Cluster { assert(replaced.node.port == newMember.node.port, "Replacement Cluster.MembershipChange should be for same non-unique node; Was: \(replaced), and \(newMember)") self.replaced = replaced - self.node = newMember.node - self.fromStatus = nil // a replacement means that the new member is "new" after all, so the move is from unknown + self.member = newMember + self.fromStatus = replaced.status self.toStatus = newMember.status } - - /// Current member that is part of the membership after this change - public var member: Cluster.Member { - Cluster.Member(node: self.node, status: self.toStatus) - } } } diff --git a/Sources/DistributedActors/Cluster/ClusterShell+LeaderActions.swift b/Sources/DistributedActors/Cluster/ClusterShell+LeaderActions.swift new file mode 100644 index 000000000..a97a013a5 --- /dev/null +++ b/Sources/DistributedActors/Cluster/ClusterShell+LeaderActions.swift @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +import DistributedActorsConcurrencyHelpers +import Logging +import NIO + +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: Collect leader actions + +extension ClusterShellState { + /// If, and only if, the current node is a leader it performs a set of tasks, such as moving nodes to `.up` etc. 
+ func collectLeaderActions() -> [LeaderAction] { + guard self.membership.isLeader(self.myselfNode) else { + return [] // since we are not the leader, we perform no tasks + } + + guard self.latestGossip.converged() else { + return [] // leader actions are only performed when + } + + func collectMemberUpMoves() -> [LeaderAction] { + let joiningMembers = self.membership.members(withStatus: .joining).sorted(by: Cluster.Member.ageOrdering) + + return joiningMembers.map { joiningMember in + let change = Cluster.MembershipChange(member: joiningMember, toStatus: .up) + return LeaderAction.moveMember(change) + } + } + + func collectDownMemberRemovals() -> [LeaderAction] { + let toExitMembers = self.membership.members(withStatus: .down) + + return toExitMembers.map { member in + LeaderAction.removeMember(alreadyDownMember: member) + } + } + + var leadershipActions: [LeaderAction] = [] + leadershipActions.append(contentsOf: collectMemberUpMoves()) + leadershipActions.append(contentsOf: collectDownMemberRemovals()) + + return leadershipActions + } + + enum LeaderAction: Equatable { + case moveMember(Cluster.MembershipChange) + case removeMember(alreadyDownMember: Cluster.Member) + } +} + +// ==== ---------------------------------------------------------------------------------------------------------------- +// MARK: Interpret leader actions in Shell + +extension ClusterShell { + func interpretLeaderActions( + _ system: ActorSystem, _ previousState: ClusterShellState, + _ leaderActions: [ClusterShellState.LeaderAction], + file: String = #file, line: UInt = #line + ) -> ClusterShellState { + guard !leaderActions.isEmpty else { + return previousState + } + + guard previousState.latestGossip.converged() else { + previousState.log.warning("SKIPPING LEADER ACTIONS, NOT CONVERGED", metadata: [ + "tag": "leader-action", + "leader/actions": "\(leaderActions)", + "gossip/current": "\(previousState.latestGossip)", + "leader/interpret/location": "\(file):\(line)", + ]) + return previousState + } + + var state = previousState + state.log.trace("Performing leader actions: \(leaderActions)", metadata: [ + "tag": "leader-action", + "leader/actions": "\(leaderActions)", + "gossip/converged": "\(state.latestGossip.converged())", + "gossip/current": "\(state.latestGossip)", + "leader/interpret/location": "\(file):\(line)", + ]) + + for leaderAction in leaderActions { + switch leaderAction { + case .moveMember(let movingUp): + self.interpretMoveMemberLeaderAction(&state, movingUp: movingUp) + + case .removeMember(let memberToRemove): + self.interpretRemoveMemberLeaderAction(system, &state, memberToRemove: memberToRemove) + } + } + + previousState.log.trace("Membership state after leader actions: \(state.membership)", metadata: [ + "tag": "leader-action", + "leader/interpret/location": "\(file):\(line)", + "gossip/current": "\(state.latestGossip)", + "gossip/before": "\(previousState.latestGossip)", + ]) + + return state + } + + func interpretMoveMemberLeaderAction(_ state: inout ClusterShellState, movingUp: Cluster.MembershipChange) { + guard let change = state.membership.apply(movingUp) else { + return + } + + if let downReplacedNodeChange = change.replacementDownPreviousNodeChange { + state.log.info("Downing replaced member: \(change)", metadata: [ + "tag": "leader-action", + ]) + state.events.publish(.membershipChange(downReplacedNodeChange)) + } + + state.log.info("Leader moved member: \(change)", metadata: [ + "tag": "leader-action", + ]) + + state.events.publish(.membershipChange(change)) + } + + /// Removal also 
terminates (and tombstones) the association to the given node. + func interpretRemoveMemberLeaderAction(_ system: ActorSystem, _ state: inout ClusterShellState, memberToRemove: Cluster.Member) { + let previousGossip = state.latestGossip + // !!! IMPORTANT !!! + // We MUST perform the prune on the _latestGossip_, not the wrapper, + // as otherwise the wrapper enforces "vector time moves forward" + guard let removalChange = state._latestGossip.pruneMember(memberToRemove) else { + return + } + state._latestGossip.incrementOwnerVersion() + state.gossipControl.update(payload: state._latestGossip) + + switch state.association(with: memberToRemove.node.node) { + case .some(.associated(let associated)): + self.terminateAssociation(system, state: &state, associated) + case .some(.tombstone(let tombstone)): + state.log.trace("Attempted to remove association but not associated (already tombstoned): \(memberToRemove), tombstone: \(tombstone)") + case .none: + // very carefully ensure that even though it was not associated, we DO store a tombstone for it -- this is in case + // we are racing against establishing an association with a node that we already know should be dead (and for some reason it'd reply to our handshake (zombie!)). + if let removed = state.removeAssociation(system, associatedNode: memberToRemove.node) { + self.finishTerminateAssociation(system, state: &state, removalDirective: removed) + } else { + let enforceTombstone = ClusterShellState.RemoveAssociationDirective(removedAssociation: nil, tombstone: .init(remoteNode: memberToRemove.node)) + state.log.warning("Attempted to remove association but not associated NOR tombstoned: \(memberToRemove), attempting to tombstone: \(enforceTombstone)") + self.finishTerminateAssociation(system, state: &state, removalDirective: enforceTombstone) + } + } + + state.log.info("Leader removed member: \(memberToRemove), all nodes are certain to have seen it as [.down] before", metadata: [ + "tag": "leader-action", + "gossip/current": "\(state.latestGossip)", + "gossip/before": "\(previousGossip)", + ]) + + // TODO: will this "just work" as we removed from membership, so gossip will tell others...? + // or do we need to push a round of gossip with .removed anyway? 
+ state.events.publish(.membershipChange(removalChange)) + } +} diff --git a/Sources/DistributedActors/Cluster/ClusterShell.swift b/Sources/DistributedActors/Cluster/ClusterShell.swift index 615f5e76a..f2aebcf32 100644 --- a/Sources/DistributedActors/Cluster/ClusterShell.swift +++ b/Sources/DistributedActors/Cluster/ClusterShell.swift @@ -84,15 +84,13 @@ internal class ClusterShell { } /// Terminate an association including its connection, and store a tombstone for it - internal func terminateAssociation(_ system: ActorSystem, state: ClusterShellState, _ associated: Association.AssociatedState) -> ClusterShellState { + internal func terminateAssociation(_ system: ActorSystem, state: inout ClusterShellState, _ associated: Association.AssociatedState) { traceLog_Remote(system.cluster.node, "Terminate association [\(associated.remoteNode)]") - var state = state - if let directive = state.removeAssociation(system, associatedNode: associated.remoteNode) { - return self.finishTerminateAssociation(system, state: state, removalDirective: directive) + if let removalDirective = state.removeAssociation(system, associatedNode: associated.remoteNode) { + self.finishTerminateAssociation(system, state: &state, removalDirective: removalDirective) } else { - // no association to remove, thus nothing to act on - return state + () // no association to remove, thus nothing to act on } } @@ -102,10 +100,8 @@ internal class ClusterShell { /// - ensures node is at least .down in the Membership /// /// Can be invoked as result of a direct .down being issued, or because of a node replacement happening. - internal func finishTerminateAssociation(_ system: ActorSystem, state: ClusterShellState, removalDirective: ClusterShellState.RemoveAssociationDirective) -> ClusterShellState { + internal func finishTerminateAssociation(_ system: ActorSystem, state: inout ClusterShellState, removalDirective: ClusterShellState.RemoveAssociationDirective) { traceLog_Remote(system.cluster.node, "Finish terminate association [\(removalDirective.tombstone.remoteNode)]") - var state = state - let remoteNode = removalDirective.tombstone.remoteNode // tombstone the association in the shell immediately. @@ -118,7 +114,7 @@ internal class ClusterShell { // if the association was removed, we need to close it and ensure that other layers are synced up with this decision guard let removedAssociation = removalDirective.removedAssociation else { - return state + return } // notify the failure detector, that we shall assume this node as dead from here on. @@ -129,13 +125,15 @@ internal class ClusterShell { // Ensure to remove (completely) the member from the Membership, it is not even .leaving anymore. if state.membership.mark(remoteNode, as: .down) == nil { // it was already removed, nothing to do - state.log.trace("Finish association with \(remoteNode), yet node not in membership already?") + state.log.trace("Finish association with \(remoteNode), yet node not in membership already?", metadata: [ + "cluster/membership": "\(state.membership)", + ]) } // else: Note that we CANNOT remove() just yet, as we only want to do this when all nodes have seen the down/leaving + // The lat thing we attempt to do with the other node is to shoot it, in case it's a "zombie" that still may + // receive messages for some reason. 
let remoteControl = removedAssociation.makeRemoteControl() ClusterShell.shootTheOtherNodeAndCloseConnection(system: system, targetNodeRemoteControl: remoteControl) - - return state } /// Final action performed when severing ties with another node. @@ -425,9 +423,11 @@ extension ClusterShell { return self.onShutdownCommand(context, state: state, signalOnceUnbound: receptacle) case .downCommand(let node): - return self.ready(state: state.membership.members(node).reduce(state) { _, member in - self.onDownCommand(context, state: state, member: member) - }) + if let member = state.membership.member(node) { + return self.ready(state: self.onDownCommand(context, state: state, member: member)) + } else { + return self.ready(state: state) + } case .downCommandMember(let member): return self.ready(state: self.onDownCommand(context, state: state, member: member)) } @@ -475,8 +475,8 @@ extension ClusterShell { self.tracelog(context, .receive(from: state.myselfNode.node), message: event) var state = state - let changeDirective = state.applyClusterEventAsChange(event) - self.interpretLeaderActions(&state, changeDirective.leaderActions) + let changeDirective = state.applyClusterEvent(event) + state = self.interpretLeaderActions(context.system, state, changeDirective.leaderActions) if case .membershipChange(let change) = event { self.tryIntroduceGossipPeer(context, state, change: change) @@ -500,8 +500,8 @@ extension ClusterShell { var state = state let beforeGossipMerge = state.latestGossip + let mergeDirective = state.latestGossip.mergeForward(incoming: gossip) // mutates the gossip - // TODO: here we could check if state.latestGossip.converged { do stuff } context.log.trace("Local membership version is [.\(mergeDirective.causalRelation)] to incoming gossip; Merge resulted in \(mergeDirective.effectiveChanges.count) changes.", metadata: [ "tag": "membership", "membership/changes": Logger.MetadataValue.array(mergeDirective.effectiveChanges.map { Logger.MetadataValue.stringConvertible($0) }), @@ -511,14 +511,23 @@ extension ClusterShell { ]) mergeDirective.effectiveChanges.forEach { effectiveChange in -// /// we are careful to only introduce here, and only "new" members -// // TODO: consider reusing receptionist for this (!) 
-// self.tryIntroduceGossipPeer(context, state, change: effectiveChange) -// + // a change COULD have also been a replacement, in which case we need to publish it as well: + // the removal (.down) of the node that was replaced + if let replacementChange = effectiveChange.replacementDownPreviousNodeChange { + self.clusterEvents.publish(.membershipChange(replacementChange)) + } let event: Cluster.Event = .membershipChange(effectiveChange) self.clusterEvents.publish(event) } + let leaderActions = state.collectLeaderActions() + if !leaderActions.isEmpty { + state.log.trace("Leadership actions upon gossip: \(leaderActions)", metadata: [ + "tag": "membership", + ]) + } + state = self.interpretLeaderActions(context.system, state, leaderActions) + return self.ready(state: state) } @@ -678,15 +687,15 @@ extension ClusterShell { if directive.membershipChange.replaced != nil, let removalDirective = directive.beingReplacedAssociationToTerminate { state.log.warning("Tombstone association: \(reflecting: removalDirective.tombstone.remoteNode)") - state = self.finishTerminateAssociation(context.system, state: state, removalDirective: removalDirective) + self.finishTerminateAssociation(context.system, state: &state, removalDirective: removalDirective) } // TODO: try to pull off with receptionist the same dance self.tryIntroduceGossipPeer(context, state, change: directive.membershipChange) /// a new node joined, thus if we are the leader, we should perform leader tasks to potentially move it to .up - let actions = state.tryCollectLeaderActions() - self.interpretLeaderActions(&state, actions) // TODO: not so DRY between acceptAndAssociate and onHandshakeAccepted, DRY it up + let actions = state.collectLeaderActions() + state = self.interpretLeaderActions(context.system, state, actions) /// only after leader (us, if we are one) performed its tasks, we update the metrics on membership (it might have modified membership) self.recordMetrics(context.system.metrics, membership: state.membership) @@ -718,21 +727,6 @@ extension ClusterShell { return .same } } - - internal func interpretLeaderActions(_ state: inout ClusterShellState, _ actions: [ClusterShellState.LeaderAction]) { - for action in actions { - switch action { - case .moveMember(let movingUp): - let change = state.membership.apply(movingUp) // TODO: include up numbers - - // FIXME: the changes should be gossiped rather than sent directly - if let change = change { - state.log.info("Leader moving member: \(change)") - state.events.publish(.membershipChange(change)) - } - } - } - } } // ==== ---------------------------------------------------------------------------------------------------------------- @@ -809,12 +803,12 @@ extension ClusterShell { // MUST be finishTerminate... and not terminate... 
because if we started a full terminateAssociation here // we would terminate the _current_ association which was already removed by the associate() because // it already _replaced_ some existing association (held in beingReplacedAssociation) - state = self.finishTerminateAssociation(context.system, state: state, removalDirective: replacedNodeRemovalDirective) + self.finishTerminateAssociation(context.system, state: &state, removalDirective: replacedNodeRemovalDirective) } /// a new node joined, thus if we are the leader, we should perform leader tasks to potentially move it to .up - let actions = state.tryCollectLeaderActions() - self.interpretLeaderActions(&state, actions) + let actions = state.collectLeaderActions() + state = self.interpretLeaderActions(context.system, state, actions) // TODO: return self.changedMembership which can do the publishing and publishing of metrics? we do it now in two places separately (incoming/outgoing accept) /// only after leader (us, if we are one) performed its tasks, we update the metrics on membership (it might have modified membership) @@ -994,6 +988,8 @@ extension ClusterShell { context.system.shutdown() } + state = self.interpretLeaderActions(context.system, state, state.collectLeaderActions()) + return state } @@ -1002,14 +998,18 @@ extension ClusterShell { guard let association = state.association(with: memberToDown.node.node) else { context.log.warning("Received Down command for not associated node [\(reflecting: memberToDown.node.node)], ignoring.") + state = self.interpretLeaderActions(context.system, state, state.collectLeaderActions()) return state } switch association { case .associated(let associated): - return self.terminateAssociation(context.system, state: state, associated) + self.terminateAssociation(context.system, state: &state, associated) + state = self.interpretLeaderActions(context.system, state, state.collectLeaderActions()) + return state case .tombstone: state.log.warning("Attempted to .down already tombstoned association/node: [\(memberToDown)]") + state = self.interpretLeaderActions(context.system, state, state.collectLeaderActions()) return state } } diff --git a/Sources/DistributedActors/Cluster/ClusterShellState.swift b/Sources/DistributedActors/Cluster/ClusterShellState.swift index 21a1d3555..675510466 100644 --- a/Sources/DistributedActors/Cluster/ClusterShellState.swift +++ b/Sources/DistributedActors/Cluster/ClusterShellState.swift @@ -70,11 +70,17 @@ internal struct ClusterShellState: ReadOnlyClusterState { self._latestGossip } set { - // FIXME: do we need the check? if self._latestGossip.membership == newValue.membership { self._latestGossip = newValue } else { - self._latestGossip = newValue.incrementingOwnerVersion() + let next: Cluster.Gossip + if self._latestGossip.version == newValue.version { + next = newValue.incrementingOwnerVersion() + } else { + next = newValue + } + + self._latestGossip = next } self.gossipControl.update(payload: self._latestGossip) } @@ -411,8 +417,8 @@ extension ClusterShellState { extension ClusterShellState { /// Generates and applies changes; generating actions to be taken by the `ClusterShell` if and only if it is the Leader, /// after this change has been applied. 
- mutating func applyClusterEventAsChange(_ event: Cluster.Event) -> AppliedClusterEventDirective { - let changeWasApplied: Bool + mutating func applyClusterEvent(_ event: Cluster.Event) -> AppliedClusterEventDirective { + var changeWasApplied: Bool switch event { case .leadershipChange(let change): @@ -441,10 +447,14 @@ extension ClusterShellState { } else { changeWasApplied = false } - case .snapshot: - // TODO: not handling snapshot here, we are a source of snapshots... yet what about gossip vs. "push membership", we may want ot handle here, by diff+apply - self.log.info("SNAPSHOT NOT APPLIED, NOT IMPLEMENTED; \(event)") + case .snapshot(let snapshot): + /// Realistically we are a SOURCE of snapshots, not a destination of them, however for completeness let's implement it: changeWasApplied = false + for change in Cluster.Membership._diff(from: .empty, to: snapshot).changes { + let directive = self.applyClusterEvent(.membershipChange(change)) + changeWasApplied = changeWasApplied || directive.applied + // actions we'll calculate below, once + } } guard changeWasApplied else { @@ -452,7 +462,7 @@ extension ClusterShellState { } // will be empty if myself node is NOT a leader - let leaderActions = self.tryCollectLeaderActions() // TODO: performing actions has been moved to ClusterShell, cleanup here some more + let leaderActions = self.collectLeaderActions() // TODO: actions may want to be acted upon, they're like directives, we currently have no such need though; // such actions be e.g. "kill association right away" or "asap tell that node .down" directly without waiting for gossip etc @@ -466,38 +476,6 @@ extension ClusterShellState { // will be empty if myself node is NOT a Leader let leaderActions: [LeaderAction] } - - /// If, and only if, the current node is a leader it performs a set of tasks, such as moving nodes to `.up` etc. 
- // TODO: test the actions when leader, not leader, that only applies to joining ones etc - mutating func tryCollectLeaderActions() -> [LeaderAction] { - guard self.membership.isLeader(self.myselfNode) else { - return [] // since we are not the leader, we perform no tasks - } - - func moveMembersUp() -> [LeaderAction] { - let joiningMembers = self.membership.members(withStatus: .joining) - - return joiningMembers.map { joiningMember in - let change = Cluster.MembershipChange(member: joiningMember, toStatus: .up) - return LeaderAction.moveMember(change) - } - } - - func removeMembers() -> [LeaderAction] { - // TODO: implement member removal; once all nodes have seen a node as Down, we move it to Removed - return [] - } - - var leadershipActions: [LeaderAction] = [] - leadershipActions.append(contentsOf: moveMembersUp()) - leadershipActions.append(contentsOf: removeMembers()) - - return leadershipActions - } - - enum LeaderAction { - case moveMember(Cluster.MembershipChange) - } } // ==== ---------------------------------------------------------------------------------------------------------------- @@ -509,8 +487,4 @@ extension ClusterShellState { "membership/count": "\(String(describing: self.membership.count(atLeast: .joining)))", ] } - - func logMembership() { - self.log.info("MEMBERSHIP:::: \(self.membership.prettyDescription(label: self.myselfNode.node.systemName))") - } } diff --git a/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift b/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift index ca8e96beb..536512630 100644 --- a/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift +++ b/Sources/DistributedActors/Cluster/DowningStrategy/DowningStrategy.swift @@ -73,7 +73,6 @@ internal struct DowningStrategyShell { func receiveClusterEvent(_ context: ActorContext, event: Cluster.Event) throws { let directive: DowningStrategyDirective = try self.strategy.onClusterEvent(event: event) - context.log.trace("Downing strategy reaction to \(event) resulted in: \(directive)", metadata: self.metadata) self.interpret(context, directive) } diff --git a/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift b/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift index c3f6325ba..c6717b88a 100644 --- a/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift +++ b/Sources/DistributedActors/Cluster/DowningStrategy/TimeoutBasedDowningStrategy.swift @@ -54,11 +54,15 @@ extension TimeoutBasedDowningStrategy: DowningStrategy { return .none } - if change.isAtLeastDown || change.isRemoval || change.isReplacement { + if change.isAtLeastDown { // it was marked as down by someone, we don't need to track it anymore _ = self._markAsDown.remove(change.member) _ = self._unreachable.remove(change.member) return .cancelTimer(key: self.timerKey(change.member)) + } else if let replaced = change.replaced { + _ = self._markAsDown.remove(replaced) + _ = self._unreachable.remove(replaced) + return .markAsDown([replaced]) } return .none @@ -113,7 +117,7 @@ extension TimeoutBasedDowningStrategy: DowningStrategy { func onTimeout(_ member: Cluster.Member) -> DowningStrategyDirective { guard let nodeToDown = self._unreachable.remove(member) else { - return .none + return .none // perhaps we removed it already for other reasons (e.g. 
node replacement) } if self.isLeader { return .markAsDown([nodeToDown]) @@ -123,7 +127,6 @@ extension TimeoutBasedDowningStrategy: DowningStrategy { } } - // TODO: remove this func onMemberRemoved(_ member: Cluster.Member) -> DowningStrategyDirective { self._markAsDown.remove(member) @@ -136,6 +139,10 @@ extension TimeoutBasedDowningStrategy: DowningStrategy { } public struct TimeoutBasedDowningStrategySettings { + /// Provides a slight delay after noticing an `.unreachable` member before declaring it down. + /// + /// Generally, with a distributed failure detector such a delay may not be necessary, however it is available in case + /// you want environments which notice "things are bad" but do not act on it immediately. public var downUnreachableMembersAfter: TimeAmount = .seconds(1) public static var `default`: TimeoutBasedDowningStrategySettings { diff --git a/Sources/DistributedActors/Cluster/Leadership.swift b/Sources/DistributedActors/Cluster/Leadership.swift index 698958381..141ae4091 100644 --- a/Sources/DistributedActors/Cluster/Leadership.swift +++ b/Sources/DistributedActors/Cluster/Leadership.swift @@ -314,7 +314,7 @@ extension Leadership { // select the leader, by lowest address let leader = membersToSelectAmong - .sorted { $0.node < $1.node } + .sorted(by: Cluster.Member.lowestAddressOrdering) .first if let change = try! membership.applyLeadershipChange(to: leader) { // try! safe, as we KNOW this member is part of membership diff --git a/Sources/DistributedActors/Cluster/NodeDeathWatcher.swift b/Sources/DistributedActors/Cluster/NodeDeathWatcher.swift index 609f02b50..60be17212 100644 --- a/Sources/DistributedActors/Cluster/NodeDeathWatcher.swift +++ b/Sources/DistributedActors/Cluster/NodeDeathWatcher.swift @@ -128,6 +128,7 @@ enum NodeDeathWatcherShell { case membershipChange(Cluster.MembershipChange) } + // FIXME: death watcher is incomplete, should handle snapshot!! static func behavior(clusterEvents: EventStream) -> Behavior { return .setup { context in let instance = NodeDeathWatcherInstance(selfNode: context.system.settings.cluster.uniqueBindNode) @@ -155,7 +156,7 @@ enum NodeDeathWatcherShell { _ = instance.onActorWatched(by: watcher, remoteNode: remoteNode) // TODO: return and interpret directives case .membershipSnapshot(let membership): - let diff = Cluster.Membership.diff(from: lastMembership, to: membership) + let diff = Cluster.Membership._diff(from: lastMembership, to: membership) for change in diff.changes { _ = instance.onMembershipChanged(change) // TODO: return and interpret directives diff --git a/Sources/DistributedActors/Cluster/Protobuf/ClusterEvents.pb.swift b/Sources/DistributedActors/Cluster/Protobuf/ClusterEvents.pb.swift index 0ca41bfe1..8b442d1ce 100644 --- a/Sources/DistributedActors/Cluster/Protobuf/ClusterEvents.pb.swift +++ b/Sources/DistributedActors/Cluster/Protobuf/ClusterEvents.pb.swift @@ -159,7 +159,7 @@ struct ProtoClusterLeadershipChange { // MARK: - Code below here is support for the SwiftProtobuf runtime. 
extension ProtoClusterEvent: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { - static let protoMessageName: String = "Cluster.Event" + static let protoMessageName: String = "ClusterEvent" static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ 1: .same(proto: "snapshot"), 2: .same(proto: "membershipChange"), diff --git a/Sources/DistributedActors/Cluster/Protobuf/Membership+Serialization.swift b/Sources/DistributedActors/Cluster/Protobuf/Membership+Serialization.swift index 85dbc7b30..9edddffcc 100644 --- a/Sources/DistributedActors/Cluster/Protobuf/Membership+Serialization.swift +++ b/Sources/DistributedActors/Cluster/Protobuf/Membership+Serialization.swift @@ -54,7 +54,7 @@ extension Cluster.Member: InternalProtobufRepresentable { proto.node = try self.node.toProto(context: context) proto.status = self.status.toProto(context: context) proto.reachability = try self.reachability.toProto(context: context) - if let number = self.upNumber { + if let number = self._upNumber { proto.upNumber = UInt32(number) } return proto @@ -67,7 +67,7 @@ extension Cluster.Member: InternalProtobufRepresentable { self.node = try .init(fromProto: proto.node, context: context) self.status = try .init(fromProto: proto.status, context: context) self.reachability = try .init(fromProto: proto.reachability, context: context) - self.upNumber = proto.upNumber == 0 ? nil : Int(proto.upNumber) + self._upNumber = proto.upNumber == 0 ? nil : Int(proto.upNumber) } } @@ -105,10 +105,10 @@ extension Cluster.MemberStatus { proto = .joining case .up: proto = .up - case .down: - proto = .down case .leaving: proto = .leaving + case .down: + proto = .down case .removed: proto = .removed } @@ -125,10 +125,10 @@ extension Cluster.MemberStatus { self = .joining case .up: self = .up - case .down: - self = .down case .leaving: self = .leaving + case .down: + self = .down case .removed: self = .removed } diff --git a/Sources/DistributedActors/Cluster/Protobuf/Membership.pb.swift b/Sources/DistributedActors/Cluster/Protobuf/Membership.pb.swift index f8245dad2..8bb19d6ea 100644 --- a/Sources/DistributedActors/Cluster/Protobuf/Membership.pb.swift +++ b/Sources/DistributedActors/Cluster/Protobuf/Membership.pb.swift @@ -314,7 +314,7 @@ extension ProtoClusterMembership: SwiftProtobuf.Message, SwiftProtobuf._MessageI } extension ProtoClusterMember: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { - static let protoMessageName: String = "Cluster.Member" + static let protoMessageName: String = "ClusterMember" static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ 1: .same(proto: "node"), 2: .same(proto: "status"), diff --git a/Sources/DistributedActors/Cluster/SWIM/SWIMInstance+Logging.swift b/Sources/DistributedActors/Cluster/SWIM/SWIMInstance+Logging.swift index 3604501a8..448075f52 100644 --- a/Sources/DistributedActors/Cluster/SWIM/SWIMInstance+Logging.swift +++ b/Sources/DistributedActors/Cluster/SWIM/SWIMInstance+Logging.swift @@ -23,6 +23,7 @@ extension SWIM.Instance { [ "swim/membersToPing": "\(self.membersToPing)", "swim/protocolPeriod": "\(self.protocolPeriod)", + "swim/timeoutSuspectsBeforePeriod": "\(self.timeoutSuspectsBeforePeriod)", "swim/incarnation": "\(self.incarnation)", "swim/memberCount": "\(self.memberCount)", "swim/suspectCount": "\(self.suspects.count)", diff --git a/Sources/DistributedActors/Cluster/SWIM/SWIMInstance.swift b/Sources/DistributedActors/Cluster/SWIM/SWIMInstance.swift index 2f034d37a..275d0afad 100644 
--- a/Sources/DistributedActors/Cluster/SWIM/SWIMInstance.swift +++ b/Sources/DistributedActors/Cluster/SWIM/SWIMInstance.swift @@ -228,7 +228,15 @@ final class SWIMInstance { } var protocolPeriod: Int { - return self._protocolPeriod + self._protocolPeriod + } + + /// When checking suspicion timeouts, any member suspicion observed at or before this protocol period shall be considered unreachable (e.g. with protocolPeriod == 10 and suspicionTimeoutPeriodsMax == 3, suspicions observed at period 7 or earlier have timed out). + /// + /// Note that this value may be negative, which simply means that we have not yet reached a protocol period old enough for any of the + /// suspects to be marked unreachable. + var timeoutSuspectsBeforePeriod: Int { + self.protocolPeriod - self.settings.failureDetector.suspicionTimeoutPeriodsMax } func status(of ref: ActorRef) -> SWIM.Status? { @@ -518,9 +526,6 @@ extension SWIM.Instance { /// we could get information through the seed nodes about the new members; but we still have never talked to them, /// thus we need to ensure we have a connection to them, before we consider adding them to the membership). case connect(node: UniqueNode, onceConnected: (Result) -> Void) -// case markedSuspect(member: SWIM.Member) -// /// Meaning the node is now marked `DEAD`. -// case confirmedDead(member: SWIM.Member) } struct MemberStatusChange { diff --git a/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift b/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift index dd7bcb61e..755e8ed8b 100644 --- a/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift +++ b/Sources/DistributedActors/Cluster/SWIM/SWIMShell.swift @@ -225,7 +225,12 @@ internal struct SWIMShell { switch result { case .failure(let err): if let timeoutError = err as? TimeoutError { - context.log.warning("Did not receive ack from \(reflecting: pingedMember.address) within [\(timeoutError.timeout.prettyDescription)]. Sending ping requests to other members.") + context.log.warning(""" + Did not receive ack from \(reflecting: pingedMember.address) within [\(timeoutError.timeout.prettyDescription)]. \ + Sending ping requests to other members. + """, metadata: [ + "swim/target": "\(self.swim.member(for: pingedMember), orElse: "nil")", + ]) } else { context.log.warning("\(err) Did not receive ack from \(reflecting: pingedMember.address) within configured timeout. 
Sending ping requests to other members.") } @@ -329,7 +334,7 @@ internal struct SWIMShell { func checkSuspicionTimeouts(context: ActorContext) { // TODO: push more of logic into SWIM instance, the calculating // FIXME: use decaying timeout as proposed in lifeguard paper - let timeoutSuspectsBeforePeriod = (self.swim.protocolPeriod - self.swim.settings.failureDetector.suspicionTimeoutPeriodsMax) + let timeoutSuspectsBeforePeriod = self.swim.timeoutSuspectsBeforePeriod context.log.trace("Checking suspicion timeouts...", metadata: [ "swim/suspects": "\(self.swim.suspects)", "swim/all": "\(self.swim._allMembersDict)", @@ -398,7 +403,7 @@ internal struct SWIMShell { case .ignored(let level, let message): if let level = level, let message = message { - context.log.log(level: level, message) + context.log.log(level: level, message, metadata: self.swim.metadata) } case .applied(let change, _, _): diff --git a/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift b/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift index c61573802..90f6075a0 100644 --- a/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift +++ b/Sources/DistributedActors/Cluster/Transport/ActorRef+RemotePersonality.swift @@ -65,7 +65,7 @@ public final class RemotePersonality { // TODO: optionally carry file/line? remoteControl.sendUserMessage(type: Message.self, envelope: Envelope(payload: .message(message)), recipient: self.address) } else { - pprint("no remote control!!!! \(self.address)") + self.system.log.warning("[SWIM] No remote control, while sending to: \(self.address)") self.system.personalDeadLetters(recipient: self.address).adapted().tell(message, file: file, line: line) } } @@ -90,7 +90,7 @@ public final class RemotePersonality { } // FIXME: this is a hack/workaround, see https://github.com/apple/swift-distributed-actors/issues/383 - let maxWorkaroundSpins = 100 + let maxWorkaroundSpins = 500 for spinNr in 1 ... maxWorkaroundSpins { switch self.clusterShell.associationRemoteControl(with: remoteAddress) { case .unknown: @@ -100,7 +100,7 @@ public final class RemotePersonality { } // else, fall through to the return nil below case .associated(let remoteControl): if spinNr > 1 { - self.system.log.warning("FIXME: Workaround, ActorRef's RemotePersonality had to spin \(spinNr) times to obtain remoteControl to send message to \(self.address)") + self.system.log.debug("FIXME: Workaround, ActorRef's RemotePersonality had to spin \(spinNr) times to obtain remoteControl to send message to \(self.address)") } // self._cachedAssociationRemoteControl = remoteControl // TODO: atomically cache a remote control? 
return remoteControl diff --git a/Sources/DistributedActors/Metrics/ActorSystem+Metrics.swift b/Sources/DistributedActors/Metrics/ActorSystem+Metrics.swift index dbf9463ad..c0d346a37 100644 --- a/Sources/DistributedActors/Metrics/ActorSystem+Metrics.swift +++ b/Sources/DistributedActors/Metrics/ActorSystem+Metrics.swift @@ -102,8 +102,8 @@ internal class ActorSystemMetrics { /// cluster.members let _cluster_members_joining: Gauge let _cluster_members_up: Gauge - let _cluster_members_down: Gauge let _cluster_members_leaving: Gauge + let _cluster_members_down: Gauge let _cluster_members_removed: Gauge let _cluster_unreachable_members: Gauge @@ -115,8 +115,8 @@ internal class ActorSystemMetrics { var joining = 0 var up = 0 - var down = 0 var leaving = 0 + var down = 0 var removed = 0 var unreachable = 0 for b in members { @@ -143,8 +143,8 @@ internal class ActorSystemMetrics { self._cluster_members.record(up) self._cluster_members_joining.record(joining) self._cluster_members_up.record(up) - self._cluster_members_down.record(down) self._cluster_members_leaving.record(leaving) + self._cluster_members_down.record(down) self._cluster_members_removed.record(removed) self._cluster_unreachable_members.record(unreachable) } @@ -212,9 +212,9 @@ internal class ActorSystemMetrics { self._cluster_members = .init(label: clusterMembersLabel) self._cluster_members_joining = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.joining.rawValue)]) self._cluster_members_up = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.joining.rawValue)]) - self._cluster_members_down = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.down.rawValue)]) self._cluster_members_leaving = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.leaving.rawValue)]) - self._cluster_members_removed = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.removed.rawValue)]) + self._cluster_members_down = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.down.rawValue)]) + self._cluster_members_removed = .init(label: clusterMembersLabel, dimensions: [("status", Cluster.MemberStatus.removed.rawValue)]) // TODO: this is equal to number of stored tombstones kind of self._cluster_unreachable_members = .init(label: clusterMembersLabel, dimensions: [("reachability", Cluster.MemberReachability.unreachable.rawValue)]) let clusterAssociations = settings.makeLabel("cluster", "associations") diff --git a/Sources/DistributedActors/Pattern/ConvergentGossip.swift b/Sources/DistributedActors/Pattern/ConvergentGossip.swift index c3a2f8f4c..2495dbfb7 100644 --- a/Sources/DistributedActors/Pattern/ConvergentGossip.swift +++ b/Sources/DistributedActors/Pattern/ConvergentGossip.swift @@ -86,8 +86,8 @@ final class ConvergentGossip { private func receiveGossip(_ context: ActorContext, envelope: ConvergentGossip.GossipEnvelope) { context.log.trace("Received gossip: \(envelope)", metadata: [ - "gossip/localPayload": "\(String(reflecting: self.payload))", - "actor/message": "\(envelope)", + "gossip/current": "\(String(reflecting: self.payload))", + "gossip/incoming": "\(envelope)", ]) // send to recipient which may then update() the payload we are gossiping @@ -103,15 +103,12 @@ final class ConvergentGossip { // TODO: bump local version vector; once it is in the envelope } - // FIXME: this is still just broadcasting (!) 
private func onPeriodicGossipTick(_ context: ActorContext) { - guard let target = self.peers.shuffled().first else { - // no peer to gossip to... - return - } - - self.sendGossip(context, to: target) self.scheduleNextGossipRound(context: context) + + if let target = self.peers.shuffled().first { + self.sendGossip(context, to: target) + } } private func sendGossip(_ context: ActorContext, to target: GossipPeerRef) { diff --git a/Sources/DistributedActors/Serialization.swift b/Sources/DistributedActors/Serialization.swift index e7ab47828..e1cf1a7e9 100644 --- a/Sources/DistributedActors/Serialization.swift +++ b/Sources/DistributedActors/Serialization.swift @@ -211,11 +211,11 @@ public struct Serialization { } public func serializerIdFor(metaType: AnyMetaType) -> SerializerId? { - return self.serializerIds[metaType.asHashable()] + self.serializerIds[metaType.asHashable()] } public func serializer(for id: SerializerId) -> AnySerializer? { - return self.serializers[id] + self.serializers[id] } internal func debugPrintSerializerTable(header: String = "") { @@ -223,7 +223,7 @@ public struct Serialization { for (key, id) in self.serializerIds.sorted(by: { $0.value < $1.value }) { p += " Serializer (id:\(id)) key:\(key) = \(self.serializers[id], orElse: "")\n" } - print(p) + self.log.debug("\(p)") } } diff --git a/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift b/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift index d8377c8f5..4a9293397 100644 --- a/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift +++ b/Sources/DistributedActorsTestKit/Cluster/ClusteredNodesTestBase.swift @@ -126,27 +126,29 @@ open class ClusteredNodesTestBase: XCTestCase { if let expectedStatus = maybeExpectedStatus { if let specificTimeout = ensureWithin { - try self.ensureNodes(expectedStatus, on: node, within: specificTimeout, systems: other) + try self.ensureNodes(expectedStatus, on: node, within: specificTimeout, nodes: other.cluster.node) } else { - try self.ensureNodes(expectedStatus, on: node, systems: other) + try self.ensureNodes(expectedStatus, on: node, nodes: other.cluster.node) } } } public func ensureNodes( - _ status: Cluster.MemberStatus, on system: ActorSystem? = nil, within: TimeAmount = .seconds(10), systems: ActorSystem..., + _ status: Cluster.MemberStatus, on system: ActorSystem? = nil, within: TimeAmount = .seconds(10), nodes: UniqueNode..., file: StaticString = #file, line: UInt = #line ) throws { - guard let onSystem = system ?? self._nodes.first else { + guard let onSystem = system ?? self._nodes.first(where: { !$0.isShuttingDown }) else { fatalError("Must at least have 1 system present to use [\(#function)]") } try self.testKit(onSystem).eventually(within: within, file: file, line: line) { - for onSystem in systems { + do { // all members on onMember should have reached this status (e.g. 
up) - for expectSystem in systems { - try self.assertMemberStatus(on: onSystem, node: expectSystem.cluster.node, is: status) + for node in nodes { + try self.assertMemberStatus(on: onSystem, node: node, is: status, file: file, line: line) } + } catch { + throw error } } } diff --git a/Sources/DistributedActorsTestKit/LogCapture.swift b/Sources/DistributedActorsTestKit/LogCapture.swift index 1928da67f..b809e471d 100644 --- a/Sources/DistributedActorsTestKit/LogCapture.swift +++ b/Sources/DistributedActorsTestKit/LogCapture.swift @@ -285,8 +285,9 @@ extension LogCapture { return found } else { let query = [ - message.map { "message: \"\($0)\"" }, prefix.map { "prefix: \"\($0)\"" }, + message.map { "message: \"\($0)\"" }, + grep.map { "grep: \"\($0)\"" }, level.map { "level: \($0)" } ?? "", expectedFile.map { "expectedFile: \"\($0)\"" }, (expectedLine > -1 ? Optional(expectedLine) : nil).map { "expectedLine: \($0)" }, @@ -296,9 +297,7 @@ extension LogCapture { let message = """ Did not find expected log, matching query: [\(query)] - in captured logs: - \(logs.map { "\($0)" }.joined(separator: "\n "))\n - at \(file):\(line) + in captured logs at \(file):\(line) """ let callSiteError = callSite.error(message) if failTest { diff --git a/Sources/DistributedActorsTestKit/ShouldMatchers.swift b/Sources/DistributedActorsTestKit/ShouldMatchers.swift index 8406327ec..14b9ce02b 100644 --- a/Sources/DistributedActorsTestKit/ShouldMatchers.swift +++ b/Sources/DistributedActorsTestKit/ShouldMatchers.swift @@ -47,9 +47,9 @@ public extension TestMatchers where T: Equatable { XCTAssertEqual(self.it, expected, msg, file: self.callSite.file, line: self.callSite.line) } - func toNotEqual(_ expected: T) { - let msg = self.callSite.detailedMessage(got: self.it, expected: expected) - XCTAssertNotEqual(self.it, expected, msg, file: self.callSite.file, line: self.callSite.line) + func toNotEqual(_ unexpectedEqual: T) { + let msg = self.callSite.detailedMessage(got: self.it, unexpectedEqual: unexpectedEqual) + XCTAssertNotEqual(self.it, unexpectedEqual, msg, file: self.callSite.file, line: self.callSite.line) } func toBe(_ expected: Other.Type) { @@ -79,7 +79,8 @@ public extension TestMatchers where T: Collection, T.Element: Equatable { if isTty { m += "\(ANSIColors.reset.rawValue)\(ANSIColors.red.rawValue)" } m += "\(self.it.dropFirst(partialMatch.underestimatedCount))] " m += "to start with prefix: " - if isTty { m += "\n " } // align with "[error] Expected " + if isTty { m += "\n" } + if isTty { m += String(repeating: " ", count: "[error] Expected ".count) } // align with the error message prefix m += "[" if isTty { m += "\(ANSIColors.bold.rawValue)" } m += "\(partialMatch)" @@ -110,6 +111,23 @@ public extension TestMatchers where T: Collection, T.Element: Equatable { } } + func toNotContain(_ el: T.Element) { + if self.it.contains(el) { + // fancy printout: + var m = "Expected \(T.self):\n " + m += self.it.map { "\($0)" }.joined(separator: "\n ") + m += "\n" + m += "to NOT contain: [" + if isTty { m += "\(ANSIColors.bold.rawValue)" } + m += "\(el)" + if isTty { m += "\(ANSIColors.reset.rawValue)\(ANSIColors.red.rawValue)" } + m += "]" + + let msg = self.callSite.detailedMessage(m) + XCTAssert(false, msg, file: self.callSite.file, line: self.callSite.line) + } + } + /// Asserts that `it` contains at least one element matching the predicate. 
func toContain(where predicate: (T.Element) -> Bool) { if !self.it.contains(where: { el in predicate(el) }) { @@ -296,6 +314,11 @@ extension Collection where Element: Equatable { return TestMatchers(it: self, callSite: csInfo).toContain(el) } + public func shouldNotContain(_ el: Element, file: StaticString = #file, line: UInt = #line, column: UInt = #column) { + let csInfo = CallSiteInfo(file: file, line: line, column: column, function: #function) + return TestMatchers(it: self, callSite: csInfo).toNotContain(el) + } + /// Applies the `where` predicate while trying to locate at least one element in the collection. public func shouldContain(where predicate: (Element) -> Bool, file: StaticString = #file, line: UInt = #line, column: UInt = #column) { let csInfo = CallSiteInfo(file: file, line: line, column: column, function: #function) @@ -417,6 +440,11 @@ struct CallSiteInfo { return self.detailedMessage(msg) } + func detailedMessage(got it: Any, unexpectedEqual: Any) -> String { + let msg = "[\(it)] does equal: [\(unexpectedEqual)]\n" + return self.detailedMessage(msg) + } + /// Prepares a detailed error information /// /// - Warning: Performs file IO in order to read source location line where failure happened diff --git a/Sources/DistributedActorsTestKit/TestProbes.swift b/Sources/DistributedActorsTestKit/TestProbes.swift index 619edb908..923cf8c0f 100644 --- a/Sources/DistributedActorsTestKit/TestProbes.swift +++ b/Sources/DistributedActorsTestKit/TestProbes.swift @@ -83,7 +83,7 @@ public class ActorTestProbe { do { self.internalRef = try spawn(behavior) } catch { - fatalError("Failed to spawn \(ActorTestProbe.self): [\(error)]:\(String(reflecting: type(of: error)))", file: file, line: line) + let _: Never = fatalErrorBacktrace("Failed to spawn \(ActorTestProbe.self): [\(error)]:\(String(reflecting: type(of: error)))", file: file, line: line) } self.name = self.internalRef.address.name diff --git a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift index 143c16e81..2e506b9f0 100644 --- a/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift +++ b/Tests/ActorSingletonPluginTests/ActorSingletonPluginClusteredTests.swift @@ -52,7 +52,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { third.cluster.join(node: second.cluster.node.node) // `first` will be the leader (lowest address) and runs the singleton - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) + try self.ensureNodes(.up, within: .seconds(10), nodes: first.cluster.node, second.cluster.node, third.cluster.node) let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) @@ -123,7 +123,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { third.cluster.join(node: second.cluster.node.node) // `first` becomes the leader (lowest address) and runs the singleton - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) + try self.ensureNodes(.up, within: .seconds(10), nodes: first.cluster.node, second.cluster.node, third.cluster.node) try replyProbe1.expectMessage("Hello-1 Charlie-1!") try replyProbe2.expectMessage("Hello-1 Charlie-2!") @@ -172,7 +172,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { first.cluster.join(node: second.cluster.node.node) third.cluster.join(node: 
second.cluster.node.node) - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) + try self.ensureNodes(.up, within: .seconds(10), nodes: first.cluster.node, second.cluster.node, third.cluster.node) let replyProbe1 = self.testKit(first).spawnTestProbe(expecting: String.self) let ref1 = try first.singleton.ref(name: GreeterSingleton.name, of: GreeterSingleton.Message.self) @@ -192,15 +192,16 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { try replyProbe3.expectMessage("Hello-1 Charlie!") // Take down the leader + let firstNode = first.cluster.node first.cluster.leave() // Make sure that `second` and `third` see `first` as down and become leader-less try self.testKit(second).eventually(within: .seconds(10)) { - try self.assertMemberStatus(on: second, node: first.cluster.node, is: .down) + try self.assertMemberStatus(on: second, node: firstNode, is: .down) try self.assertLeaderNode(on: second, is: nil) } try self.testKit(third).eventually(within: .seconds(10)) { - try self.assertMemberStatus(on: third, node: first.cluster.node, is: .down) + try self.assertMemberStatus(on: third, node: firstNode, is: .down) try self.assertLeaderNode(on: third, is: nil) } @@ -211,7 +212,7 @@ final class ActorSingletonPluginClusteredTests: ClusteredNodesTestBase { // `fourth` will become the new leader and singleton fourth.cluster.join(node: second.cluster.node.node) - try self.ensureNodes(.up, within: .seconds(10), systems: fourth, second, third) + try self.ensureNodes(.up, on: second, within: .seconds(10), nodes: fourth.cluster.node, second.cluster.node, third.cluster.node) // The stashed messages get routed to new singleton running on `fourth` try replyProbe2.expectMessage("Hello-4 Charlie-2!") diff --git a/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenActor.swift b/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenActor.swift index 5e83d1df6..92699e140 100644 --- a/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenActor.swift +++ b/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenActor.swift @@ -33,20 +33,20 @@ extension AllInOneMachine { public enum Message { case clean - case coffeeMachine(/*TODO: MODULE.*/GeneratedActor.Messages.CoffeeMachine) case diagnostics(/*TODO: MODULE.*/GeneratedActor.Messages.Diagnostics) + case coffeeMachine(/*TODO: MODULE.*/GeneratedActor.Messages.CoffeeMachine) } - /// Performs boxing of GeneratedActor.Messages.CoffeeMachine messages such that they can be received by Actor - public static func _boxCoffeeMachine(_ message: GeneratedActor.Messages.CoffeeMachine) -> AllInOneMachine.Message { - .coffeeMachine(message) - } - /// Performs boxing of GeneratedActor.Messages.Diagnostics messages such that they can be received by Actor public static func _boxDiagnostics(_ message: GeneratedActor.Messages.Diagnostics) -> AllInOneMachine.Message { .diagnostics(message) } + /// Performs boxing of GeneratedActor.Messages.CoffeeMachine messages such that they can be received by Actor + public static func _boxCoffeeMachine(_ message: GeneratedActor.Messages.CoffeeMachine) -> AllInOneMachine.Message { + .coffeeMachine(message) + } + } // ==== ---------------------------------------------------------------------------------------------------------------- // MARK: DO NOT EDIT: Generated AllInOneMachine behavior @@ -67,13 +67,13 @@ extension AllInOneMachine { instance.clean() + case .diagnostics(.printDiagnostics): + 
instance.printDiagnostics() + case .coffeeMachine(.makeCoffee(let _replyTo)): let result = instance.makeCoffee() _replyTo.tell(result) - case .diagnostics(.printDiagnostics): - instance.printDiagnostics() - } return .same }.receiveSignal { _context, signal in diff --git a/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenCodable.swift b/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenCodable.swift index 7c98af353..feb33fa6f 100644 --- a/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenCodable.swift +++ b/Tests/DistributedActorsDocumentationTests/Actorable/GenActors/AllInOneMachine+GenCodable.swift @@ -33,15 +33,15 @@ extension AllInOneMachine.Message: Codable { // TODO: Check with Swift team which style of discriminator to aim for public enum DiscriminatorKeys: String, Decodable { case clean - case _boxCoffeeMachine case _boxDiagnostics + case _boxCoffeeMachine } public enum CodingKeys: CodingKey { case _case - case _boxCoffeeMachine case _boxDiagnostics + case _boxCoffeeMachine } @@ -50,12 +50,12 @@ extension AllInOneMachine.Message: Codable { switch try container.decode(DiscriminatorKeys.self, forKey: CodingKeys._case) { case .clean: self = .clean - case ._boxCoffeeMachine: - let boxed = try container.decode(GeneratedActor.Messages.CoffeeMachine.self, forKey: CodingKeys._boxCoffeeMachine) - self = .coffeeMachine(boxed) case ._boxDiagnostics: let boxed = try container.decode(GeneratedActor.Messages.Diagnostics.self, forKey: CodingKeys._boxDiagnostics) self = .diagnostics(boxed) + case ._boxCoffeeMachine: + let boxed = try container.decode(GeneratedActor.Messages.CoffeeMachine.self, forKey: CodingKeys._boxCoffeeMachine) + self = .coffeeMachine(boxed) } } @@ -65,12 +65,12 @@ extension AllInOneMachine.Message: Codable { switch self { case .clean: try container.encode(DiscriminatorKeys.clean.rawValue, forKey: CodingKeys._case) - case .coffeeMachine(let boxed): - try container.encode(DiscriminatorKeys._boxCoffeeMachine.rawValue, forKey: CodingKeys._case) - try container.encode(boxed, forKey: CodingKeys._boxCoffeeMachine) case .diagnostics(let boxed): try container.encode(DiscriminatorKeys._boxDiagnostics.rawValue, forKey: CodingKeys._case) try container.encode(boxed, forKey: CodingKeys._boxDiagnostics) + case .coffeeMachine(let boxed): + try container.encode(DiscriminatorKeys._boxCoffeeMachine.rawValue, forKey: CodingKeys._case) + try container.encode(boxed, forKey: CodingKeys._boxCoffeeMachine) } } diff --git a/Tests/DistributedActorsTests/Actorable/ActorContextReceptionistTests.swift b/Tests/DistributedActorsTests/Actorable/ActorContextReceptionistTests.swift index 1317447d2..76a44e2ce 100644 --- a/Tests/DistributedActorsTests/Actorable/ActorContextReceptionistTests.swift +++ b/Tests/DistributedActorsTests/Actorable/ActorContextReceptionistTests.swift @@ -30,7 +30,7 @@ final class ActorContextReceptionTests: ActorSystemTestBase { return listing } - listing.actors.first!.shouldEqual(owner) + listing.actors.first.shouldEqual(owner) } func test_autoUpdatedListing_invokesOnUpdate() throws { diff --git a/Tests/DistributedActorsTests/CRDT/CRDTReplicatorShellTests.swift b/Tests/DistributedActorsTests/CRDT/CRDTReplicatorShellTests.swift index f4a655353..582ceaa24 100644 --- a/Tests/DistributedActorsTests/CRDT/CRDTReplicatorShellTests.swift +++ b/Tests/DistributedActorsTests/CRDT/CRDTReplicatorShellTests.swift @@ -273,7 +273,7 @@ final class CRDTReplicatorShellTests: ClusteredNodesTestBase { 
self.setUpRemote() try self.joinNodes(node: self.localSystem, with: self.remoteSystem) - try self.ensureNodes(.up, systems: self.localSystem, self.remoteSystem) + try self.ensureNodes(.up, nodes: self.localSystem.cluster.node, self.remoteSystem.cluster.node) let writeP = self.localTestKit.spawnTestProbe(expecting: LocalWriteResult.self) let readP = self.localTestKit.spawnTestProbe(expecting: LocalReadResult.self) @@ -320,7 +320,7 @@ final class CRDTReplicatorShellTests: ClusteredNodesTestBase { self.setUpRemote() try self.joinNodes(node: self.localSystem, with: self.remoteSystem) - try self.ensureNodes(.up, systems: self.localSystem, self.remoteSystem) + try self.ensureNodes(.up, nodes: self.localSystem.cluster.node, self.remoteSystem.cluster.node) let readP = self.localTestKit.spawnTestProbe(expecting: LocalReadResult.self) @@ -363,7 +363,7 @@ final class CRDTReplicatorShellTests: ClusteredNodesTestBase { self.setUpRemote() try self.joinNodes(node: self.localSystem, with: self.remoteSystem) - try self.ensureNodes(.up, systems: self.localSystem, self.remoteSystem) + try self.ensureNodes(.up, nodes: self.localSystem.cluster.node, self.remoteSystem.cluster.node) let readP = self.localTestKit.spawnTestProbe(expecting: LocalReadResult.self) @@ -395,7 +395,7 @@ final class CRDTReplicatorShellTests: ClusteredNodesTestBase { self.setUpRemote() try self.joinNodes(node: self.localSystem, with: self.remoteSystem) - try self.ensureNodes(.up, systems: self.localSystem, self.remoteSystem) + try self.ensureNodes(.up, nodes: self.localSystem.cluster.node, self.remoteSystem.cluster.node) let deleteP = self.localTestKit.spawnTestProbe(expecting: LocalDeleteResult.self) let readP = self.localTestKit.spawnTestProbe(expecting: LocalReadResult.self) diff --git a/Tests/DistributedActorsTests/Clocks/VersionVectorTests.swift b/Tests/DistributedActorsTests/Clocks/VersionVectorTests.swift index 609827b0c..640672e75 100644 --- a/Tests/DistributedActorsTests/Clocks/VersionVectorTests.swift +++ b/Tests/DistributedActorsTests/Clocks/VersionVectorTests.swift @@ -184,6 +184,9 @@ final class VersionVectorTests: XCTestCase { guard case .concurrent = VV([(replicaA, 1), (replicaB, 4), (replicaC, 6)]).compareTo(VV([(replicaA, 2), (replicaB, 7), (replicaC, 2)])) else { throw shouldNotHappen("Must be .concurrent relation if the two version vectors are not ordered or the same") } + guard case .concurrent = VV([(replicaA, 1)]).compareTo(VV([(replicaA, 1), (replicaB, 1)])) else { + throw shouldNotHappen("Should be .concurrent, since even if rhs has more information, there is not at least `one strictly less than`") + } } // ==== ---------------------------------------------------------------------------------------------------------------- diff --git a/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests+XCTest.swift index 30934c0f1..9957be880 100644 --- a/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests+XCTest.swift @@ -26,7 +26,6 @@ extension ClusterAssociationTests { ("test_boundServer_shouldAcceptAssociate", test_boundServer_shouldAcceptAssociate), ("test_handshake_shouldNotifyOnSuccess", test_handshake_shouldNotifyOnSuccess), ("test_handshake_shouldNotifySuccessWhenAlreadyConnected", test_handshake_shouldNotifySuccessWhenAlreadyConnected), - ("test_association_sameAddressNodeJoin_shouldOverrideExistingNode", 
test_association_sameAddressNodeJoin_shouldOverrideExistingNode), ("test_association_shouldAllowSendingToRemoteReference", test_association_shouldAllowSendingToRemoteReference), ("test_association_shouldEstablishSingleAssociationForConcurrentlyInitiatedHandshakes_incoming_outgoing", test_association_shouldEstablishSingleAssociationForConcurrentlyInitiatedHandshakes_incoming_outgoing), ("test_association_shouldEstablishSingleAssociationForConcurrentlyInitiatedHandshakes_outgoing_outgoing", test_association_shouldEstablishSingleAssociationForConcurrentlyInitiatedHandshakes_outgoing_outgoing), diff --git a/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift index cd51db1f1..4a261b6bc 100644 --- a/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift +++ b/Tests/DistributedActorsTests/Cluster/AssociationClusteredTests.swift @@ -61,7 +61,8 @@ final class ClusterAssociationTests: ClusteredNodesTestBase { // ==== ------------------------------------------------------------------------------------------------------------ // MARK: Joining into existing cluster - func test_association_sameAddressNodeJoin_shouldOverrideExistingNode() throws { + // FIXME: unlock this test + func fixme_association_sameAddressNodeJoin_shouldOverrideExistingNode() throws { try shouldNotThrow { let (first, second) = self.setUpPair() diff --git a/Tests/DistributedActorsTests/Cluster/ClusterEventStreamTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterEventStreamTests.swift index 5624a0e57..ac572251b 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterEventStreamTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterEventStreamTests.swift @@ -36,8 +36,8 @@ import NIO import XCTest final class ClusterEventStreamTests: ActorSystemTestBase { - let firstMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up) - let secondMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .up) + let memberA = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up) + let memberB = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .up) func test_clusterEventStream_shouldCollapseEventsAndOfferASnapshotToLateSubscribers() throws { let p1 = self.testKit.spawnTestProbe(expecting: Cluster.Event.self) @@ -52,9 +52,9 @@ final class ClusterEventStreamTests: ActorSystemTestBase { ) eventStream.subscribe(p1.ref) // sub before first -> up was published - eventStream.publish(.membershipChange(.init(member: self.firstMember, toStatus: .up))) + eventStream.publish(.membershipChange(.init(member: self.memberA, toStatus: .up))) eventStream.subscribe(p2.ref) - eventStream.publish(.membershipChange(.init(member: self.secondMember, toStatus: .up))) + eventStream.publish(.membershipChange(.init(member: self.memberB, toStatus: .up))) // ==== p1 --------------------- @@ -66,13 +66,13 @@ final class ClusterEventStreamTests: ActorSystemTestBase { } switch try p1.expectMessage() { case .membershipChange(let change): - change.node.shouldEqual(self.firstMember.node) + change.node.shouldEqual(self.memberA.node) default: throw p1.error("Expected a membershipChange") } switch try p1.expectMessage() { case .membershipChange(let change): - 
change.node.shouldEqual(self.secondMember.node) + change.node.shouldEqual(self.memberB.node) default: throw p1.error("Expected a membershipChange") } @@ -81,14 +81,14 @@ final class ClusterEventStreamTests: ActorSystemTestBase { switch try p2.expectMessage() { case .snapshot(let snapshot): - snapshot.members(self.firstMember.node.node).shouldEqual([firstMember]) + snapshot.uniqueMember(self.memberA.node).shouldEqual(self.memberA) () // ok default: throw p2.error("Expected a snapshot first") } switch try p2.expectMessage() { case .membershipChange(let change): - change.node.shouldEqual(self.secondMember.node) + change.node.shouldEqual(self.memberB.node) default: throw p2.error("Expected a membershipChange") } @@ -105,10 +105,10 @@ final class ClusterEventStreamTests: ActorSystemTestBase { customBehavior: ClusterEventStream.Shell.behavior ) - eventStream.publish(.membershipChange(.init(member: self.firstMember, toStatus: .joining))) - eventStream.publish(.membershipChange(.init(member: self.firstMember, toStatus: .up))) - eventStream.publish(.membershipChange(.init(member: self.secondMember, toStatus: .joining))) - eventStream.publish(.membershipChange(.init(member: self.secondMember, toStatus: .up))) + eventStream.publish(.membershipChange(.init(member: self.memberA, toStatus: .joining))) + eventStream.publish(.membershipChange(.init(member: self.memberA, toStatus: .up))) + eventStream.publish(.membershipChange(.init(member: self.memberB, toStatus: .joining))) + eventStream.publish(.membershipChange(.init(member: self.memberB, toStatus: .up))) eventStream.subscribe(p1.ref) // ==== p1 --------------------- @@ -116,7 +116,7 @@ final class ClusterEventStreamTests: ActorSystemTestBase { switch try p1.expectMessage() { case .snapshot(let snapshot): let members = snapshot.members(atLeast: .joining) - Set(members).shouldEqual(Set([firstMember, secondMember])) + Set(members).shouldEqual(Set([memberA, memberB])) default: throw p1.error("Expected a snapshot with all the data") diff --git a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsClusteredTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsClusteredTests+XCTest.swift new file mode 100644 index 000000000..d7da3a323 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsClusteredTests+XCTest.swift @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2020 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +import XCTest + +/// +/// NOTE: This file was generated by generate_linux_tests.rb +/// +/// Do NOT edit this file directly as it will be regenerated automatically when needed. 
+/// + +extension ClusterLeaderActionsClusteredTests { + static var allTests: [(String, (ClusterLeaderActionsClusteredTests) -> () throws -> Void)] { + return [ + ("test_singleLeader", test_singleLeader), + ("test_joining_to_up_decisionByLeader", test_joining_to_up_decisionByLeader), + ("test_joining_to_up_earlyYetStillLettingAllNodesKnowAboutLatestMembershipStatus", test_joining_to_up_earlyYetStillLettingAllNodesKnowAboutLatestMembershipStatus), + ("test_up_ensureAllSubscribersGetMovingUpEvents", test_up_ensureAllSubscribersGetMovingUpEvents), + ("test_down_to_removed_ensureRemovalHappensWhenAllHaveSeenDown", test_down_to_removed_ensureRemovalHappensWhenAllHaveSeenDown), + ("test_ensureDownAndRemovalSpreadsToAllMembers", test_ensureDownAndRemovalSpreadsToAllMembers), + ] + } +} diff --git a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsClusteredTests.swift new file mode 100644 index 000000000..1d45116b1 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsClusteredTests.swift @@ -0,0 +1,308 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +@testable import DistributedActors +import DistributedActorsTestKit +import Foundation +import NIOSSL +import XCTest + +final class ClusterLeaderActionsClusteredTests: ClusteredNodesTestBase { + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: leader decision: .joining -> .up + + func test_singleLeader() throws { + try shouldNotThrow { + let first = self.setUpNode("first") { settings in + settings.cluster.node.port = 7111 + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 1) + } + + let p = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) + + _ = try first.spawn("selfishSingleLeader", Behavior.setup { context in + context.system.cluster.events.subscribe(context.myself) + + return .receiveMessage { event in + switch event { + case .leadershipChange: + p.tell(event) + return .same + default: + return .same + } + } + + }) + + switch try p.expectMessage() { + case .leadershipChange(let change): + guard let leader = change.newLeader else { + throw self.testKit(first).fail("Expected \(first.cluster.node) to be leader") + } + leader.node.shouldEqual(first.cluster.node) + default: + throw self.testKit(first).fail("Expected leader change event") + } + } + } + + func test_joining_to_up_decisionByLeader() throws { + try shouldNotThrow { + let first = self.setUpNode("first") { settings in + settings.cluster.node.port = 7111 + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) + } + let second = self.setUpNode("second") { settings in + settings.cluster.node.port = 8222 + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) + } + let third = self.setUpNode("third") { settings in + settings.cluster.node.port = 9333 + settings.cluster.autoLeaderElection = 
.lowestReachable(minNumberOfMembers: 3) + } + + first.cluster.join(node: second.cluster.node.node) + third.cluster.join(node: second.cluster.node.node) + + try assertAssociated(first, withAtLeast: second.cluster.node) + try assertAssociated(second, withAtLeast: third.cluster.node) + try assertAssociated(first, withAtLeast: third.cluster.node) + + try self.testKit(first).eventually(within: .seconds(10)) { + try self.assertMemberStatus(on: first, node: first.cluster.node, is: .up) + try self.assertMemberStatus(on: first, node: second.cluster.node, is: .up) + try self.assertMemberStatus(on: first, node: third.cluster.node, is: .up) + } + + try self.testKit(second).eventually(within: .seconds(10)) { + try self.assertMemberStatus(on: second, node: first.cluster.node, is: .up) + try self.assertMemberStatus(on: second, node: second.cluster.node, is: .up) + try self.assertMemberStatus(on: second, node: third.cluster.node, is: .up) + } + + try self.testKit(third).eventually(within: .seconds(10)) { + try self.assertMemberStatus(on: third, node: first.cluster.node, is: .up) + try self.assertMemberStatus(on: third, node: second.cluster.node, is: .up) + try self.assertMemberStatus(on: third, node: third.cluster.node, is: .up) + } + } + } + + func test_joining_to_up_earlyYetStillLettingAllNodesKnowAboutLatestMembershipStatus() throws { + try shouldNotThrow { + // This showcases a racy situation, where we allow a leader elected when at least 2 nodes joined + // yet we actually join 3 nodes -- meaning that the joining up is _slightly_ racy: + // - maybe nodes 1 and 2 join each other first and 1 starts upping + // - maybe nodes 2 and 3 join each other and 2 starts upping + // - and at the same time, maybe while 1 and 2 have started joining, 2 and 3 already joined, and 2 issued up for itself and 3 + // + // All this is _fine_. The cluster leader is such that under whichever rules we allowed it to be elected + // it shall perform its duties. This tests however quickly shows that lack of letting the "third" node, + // via gossip or some other way about the ->up of other nodes once it joins the "others", it'd be stuck waiting for + // the ->up forever. + // + // In other words, this test exercises that there must be _some_ (gossip, or similar "push" membership once a new member joins), + // to a new member. + // + let first = self.setUpNode("first") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + } + let second = self.setUpNode("second") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + } + let third = self.setUpNode("third") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + } + + let fourth = self.setUpNode("fourth") { settings in + settings.cluster.autoLeaderElection = .none // even without election running, it will be notified by things by the others + } + + first.cluster.join(node: second.cluster.node.node) + third.cluster.join(node: second.cluster.node.node) + try self.ensureNodes(.up, within: .seconds(10), nodes: first.cluster.node, second.cluster.node, third.cluster.node) + + // Even the fourth node now could join and be notified about all the existing up members. + // It does not even have to run any leadership election -- there are leaders in the cluster. 
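
The settings above select leaders with `.lowestReachable(minNumberOfMembers:)`: once at least the configured number of members is present and reachable, the member with the lowest node address wins and may move `.joining` members to `.up`. A minimal standalone sketch of that rule, for illustration only (the types below are invented stand-ins, not the library's `Leadership.LowestReachableMember`):

    // Simplified, self-contained model of "lowest reachable address wins, given a quorum".
    struct CandidateMember: Comparable {
        var address: String        // e.g. "sact://System@127.0.0.1:7111"
        var isReachable: Bool
        var isDown: Bool

        static func < (lhs: CandidateMember, rhs: CandidateMember) -> Bool {
            lhs.address < rhs.address // order by the rendered address string
        }
    }

    /// Returns the elected leader, or nil if fewer than `minNumberOfMembers` members are reachable.
    func electLowestReachable(_ members: [CandidateMember], minNumberOfMembers: Int) -> CandidateMember? {
        let candidates = members.filter { $0.isReachable && !$0.isDown }
        guard candidates.count >= minNumberOfMembers else { return nil } // not enough members to decide
        return candidates.min()
    }

    // With three reachable members and a quorum of 3, the lowest address is elected:
    let candidates = [
        CandidateMember(address: "sact://System@127.0.0.1:9333", isReachable: true, isDown: false),
        CandidateMember(address: "sact://System@127.0.0.1:7111", isReachable: true, isDown: false),
        CandidateMember(address: "sact://System@127.0.0.1:8222", isReachable: true, isDown: false),
    ]
    assert(electLowestReachable(candidates, minNumberOfMembers: 3)?.address == "sact://System@127.0.0.1:7111")
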
+ // + // We only join one arbitrary node, we will be notified about all nodes: + fourth.cluster.join(node: third.cluster.node.node) + + try self.ensureNodes(.up, within: .seconds(10), nodes: first.cluster.node, second.cluster.node, third.cluster.node, fourth.cluster.node) + } + } + + func test_up_ensureAllSubscribersGetMovingUpEvents() throws { + try shouldNotThrow { + // it shall perform its duties. This tests however quickly shows that lack of letting the "third" node, + // via gossip or some other way about the ->up of other nodes once it joins the "others", it'd be stuck waiting for + // the ->up forever. + // + // In other words, this test exercises that there must be _some_ (gossip, or similar "push" membership once a new member joins), + // to a new member. + // + let first = self.setUpNode("first") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + } + let p1 = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) + first.cluster.events.subscribe(p1.ref) + + let second = self.setUpNode("second") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + } + let p2 = self.testKit(second).spawnTestProbe(expecting: Cluster.Event.self) + second.cluster.events.subscribe(p2.ref) + + first.cluster.join(node: second.cluster.node.node) + + // this ensures that the membership, as seen in ClusterShell converged on all members being up + try self.ensureNodes(.up, nodes: first.cluster.node, second.cluster.node) + + // the following tests confirm that the manually subscribed actors, got all the events they expected + + // on the leader node, the other node noticed as up: + let eventsOnFirstSub = try p1.expectMessages(count: 6) + eventsOnFirstSub.shouldContain(.snapshot(.empty)) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: nil, toStatus: .joining))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: nil, toStatus: .joining))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: .joining, toStatus: .up))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: .joining, toStatus: .up))) + eventsOnFirstSub.shouldContain(.leadershipChange(Cluster.LeadershipChange(oldLeader: nil, newLeader: .init(node: first.cluster.node, status: .joining))!)) // !-safe, since new/old leader known to be different + + // on non-leader node + let eventsOnSecondSub = try p2.expectMessages(count: 6) + eventsOnSecondSub.shouldContain(.snapshot(.empty)) + eventsOnSecondSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: nil, toStatus: .joining))) + eventsOnSecondSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: nil, toStatus: .joining))) + eventsOnSecondSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: .joining, toStatus: .up))) + eventsOnSecondSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: .joining, toStatus: .up))) + eventsOnSecondSub.shouldContain(.leadershipChange(Cluster.LeadershipChange(oldLeader: nil, newLeader: .init(node: first.cluster.node, status: .joining))!)) // !-safe, since new/old leader known to be different + } + } + + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: .down -> removal + + func 
test_down_to_removed_ensureRemovalHappensWhenAllHaveSeenDown() throws { + try shouldNotThrow { + let first = self.setUpNode("first") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + settings.cluster.downingStrategy = .timeout(.init(downUnreachableMembersAfter: .milliseconds(300))) + } + let p1 = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) + first.cluster.events.subscribe(p1.ref) + + let second = self.setUpNode("second") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + settings.cluster.downingStrategy = .timeout(.init(downUnreachableMembersAfter: .milliseconds(300))) + } + let p2 = self.testKit(second).spawnTestProbe(expecting: Cluster.Event.self) + second.cluster.events.subscribe(p2.ref) + + let third = self.setUpNode("third") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + settings.cluster.downingStrategy = .timeout(.init(downUnreachableMembersAfter: .milliseconds(300))) + } + let p3 = self.testKit(third).spawnTestProbe(expecting: Cluster.Event.self) + third.cluster.events.subscribe(p3.ref) + + try self.joinNodes(node: first, with: second) + try self.joinNodes(node: second, with: third) + try self.joinNodes(node: first, with: third) + + let secondNode = second.cluster.node + try self.ensureNodes(.up, nodes: first.cluster.node, secondNode, third.cluster.node) + + first.cluster.down(node: secondNode.node) + + // other nodes have observed it down + try self.ensureNodes(.down, on: first, nodes: secondNode) + try self.ensureNodes(.down, on: third, nodes: secondNode) + + // on the leader node, the other node noticed as up: + let eventsOnFirstSub = try p1.expectMessages(count: 9) + eventsOnFirstSub.shouldContain(.snapshot(.empty)) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: nil, toStatus: .joining))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: secondNode, fromStatus: nil, toStatus: .joining))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: .joining, toStatus: .up))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: secondNode, fromStatus: .joining, toStatus: .up))) + eventsOnFirstSub.shouldContain(.leadershipChange(Cluster.LeadershipChange(oldLeader: nil, newLeader: .init(node: first.cluster.node, status: .joining))!)) // !-safe, since new/old leader known to be different + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: third.cluster.node, fromStatus: nil, toStatus: .joining))) + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: third.cluster.node, fromStatus: .joining, toStatus: .up))) + + eventsOnFirstSub.shouldContain(.membershipChange(.init(node: secondNode, fromStatus: .up, toStatus: .down))) + + try self.testKit(first).eventually(within: .seconds(3)) { + let p1s = self.testKit(first).spawnTestProbe(expecting: Cluster.Membership.self) + first.cluster.ref.tell(.query(.currentMembership(p1s.ref))) + } + } + } + + func test_ensureDownAndRemovalSpreadsToAllMembers() throws { + try shouldNotThrow { + let first = self.setUpNode("first") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + settings.cluster.downingStrategy = .timeout(.init(downUnreachableMembersAfter: .milliseconds(300))) + } + let p1 = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) + first.cluster.events.subscribe(p1.ref) + + let second = 
self.setUpNode("second") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + settings.cluster.downingStrategy = .timeout(.init(downUnreachableMembersAfter: .milliseconds(300))) + } + let p2 = self.testKit(second).spawnTestProbe(expecting: Cluster.Event.self) + second.cluster.events.subscribe(p2.ref) + + let third = self.setUpNode("third") { settings in + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + settings.cluster.downingStrategy = .timeout(.init(downUnreachableMembersAfter: .milliseconds(300))) + } + let p3 = self.testKit(third).spawnTestProbe(expecting: Cluster.Event.self) + third.cluster.events.subscribe(p3.ref) + + try self.joinNodes(node: first, with: second) + try self.joinNodes(node: second, with: third) + try self.joinNodes(node: first, with: third) + + try self.ensureNodes(.up, nodes: first.cluster.node, second.cluster.node, third.cluster.node) + + // crash the second node + second.shutdown() + + // other nodes have observed it down + try self.ensureNodes(.down, on: first, nodes: second.cluster.node) + try self.ensureNodes(.down, on: third, nodes: second.cluster.node) + + // on the leader node, the other node noticed as up: + let testKit = self.testKit(first) + try testKit.eventually(within: .seconds(5)) { + let event: Cluster.Event? = try p1.maybeExpectMessage() + switch event { + case .membershipChange(.init(node: second.cluster.node, fromStatus: .up, toStatus: .down)): () + case let other: throw testKit.error("Expected `second` [ up] -> [ .down], on first node, was: \(other, orElse: "nil")") + } + } + try testKit.eventually(within: .seconds(5)) { + let event: Cluster.Event? = try p1.maybeExpectMessage() + switch event { + case .membershipChange(.init(node: second.cluster.node, fromStatus: .down, toStatus: .removed)): () + case let other: throw testKit.error("Expected `second` [ up] -> [ .down], on first node, was: \(other, orElse: "nil")") + } + } + } + } +} diff --git a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests+XCTest.swift index 54eba8cef..54672504d 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests+XCTest.swift @@ -23,10 +23,8 @@ import XCTest extension ClusterLeaderActionsTests { static var allTests: [(String, (ClusterLeaderActionsTests) -> () throws -> Void)] { return [ - ("test_singleLeader", test_singleLeader), - ("test_joining_to_up_decisionByLeader", test_joining_to_up_decisionByLeader), - ("test_joining_to_up_earlyYetStillLettingAllNodesKnowAboutLatestMembershipStatus", test_joining_to_up_earlyYetStillLettingAllNodesKnowAboutLatestMembershipStatus), - ("test_ensureAllSubscribersGetMovingUpEvents", test_ensureAllSubscribersGetMovingUpEvents), + ("test_leaderActions_removeDownMembers_ifKnownAsDownToAllMembers", test_leaderActions_removeDownMembers_ifKnownAsDownToAllMembers), + ("test_leaderActions_removeDownMembers_dontRemoveIfDownNotKnownToAllMembersYet", test_leaderActions_removeDownMembers_dontRemoveIfDownNotKnownToAllMembersYet), ] } } diff --git a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift index e8a9937f5..0c8850b64 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift +++ b/Tests/DistributedActorsTests/Cluster/ClusterLeaderActionsTests.swift @@ -18,177 +18,143 
@@ import Foundation import NIOSSL import XCTest -final class ClusterLeaderActionsTests: ClusteredNodesTestBase { - // ==== ------------------------------------------------------------------------------------------------------------ - // MARK: leader decision: .joining -> .up - - func test_singleLeader() throws { - try shouldNotThrow { - let first = self.setUpNode("first") { settings in - settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 1) - } - - let p = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) - - _ = try first.spawn("selfishSingleLeader", Behavior.setup { context in - context.system.cluster.events.subscribe(context.myself) - - return .receiveMessage { event in - switch event { - case .leadershipChange: - p.tell(event) - return .same - default: - return .same - } - } - - }) - - switch try p.expectMessage() { - case .leadershipChange(let change): - guard let leader = change.newLeader else { - throw self.testKit(first).fail("Expected \(first.cluster.node) to be leader") - } - leader.node.shouldEqual(first.cluster.node) - default: - throw self.testKit(first).fail("Expected leader change event") - } - } +// Unit tests of the actions, see `ClusterLeaderActionsClusteredTests` for integration tests +final class ClusterLeaderActionsTests: XCTestCase { + let _nodeA = Node(systemName: "nodeA", host: "1.1.1.1", port: 7337) + let _nodeB = Node(systemName: "nodeB", host: "2.2.2.2", port: 8228) + let _nodeC = Node(systemName: "nodeC", host: "3.3.3.3", port: 9119) + + var allNodes: [UniqueNode] { + [self.nodeA, self.nodeB, self.nodeC] } - func test_joining_to_up_decisionByLeader() throws { - try shouldNotThrow { - let first = self.setUpNode("first") { settings in - settings.cluster.node.port = 7111 - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) - } - let second = self.setUpNode("second") { settings in - settings.cluster.node.port = 8222 - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) - } - let third = self.setUpNode("third") { settings in - settings.cluster.node.port = 9333 - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 3) - } - - first.cluster.join(node: second.cluster.node.node) - third.cluster.join(node: second.cluster.node.node) - - try assertAssociated(first, withAtLeast: second.cluster.node) - try assertAssociated(second, withAtLeast: third.cluster.node) - try assertAssociated(first, withAtLeast: third.cluster.node) - - try self.testKit(first).eventually(within: .seconds(10)) { - try self.assertMemberStatus(on: first, node: first.cluster.node, is: .up) - try self.assertMemberStatus(on: first, node: second.cluster.node, is: .up) - try self.assertMemberStatus(on: first, node: third.cluster.node, is: .up) - } - - try self.testKit(second).eventually(within: .seconds(10)) { - try self.assertMemberStatus(on: second, node: first.cluster.node, is: .up) - try self.assertMemberStatus(on: second, node: second.cluster.node, is: .up) - try self.assertMemberStatus(on: second, node: third.cluster.node, is: .up) - } - - try self.testKit(third).eventually(within: .seconds(10)) { - try self.assertMemberStatus(on: third, node: first.cluster.node, is: .up) - try self.assertMemberStatus(on: third, node: second.cluster.node, is: .up) - try self.assertMemberStatus(on: third, node: third.cluster.node, is: .up) - } - } + var stateA: ClusterShellState! + var stateB: ClusterShellState! + var stateC: ClusterShellState! 
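
These unit tests drive mocked `ClusterShellState`s through the same member lifecycle the clustered tests observe end to end: a node starts as `joining`, the leader moves it to `up`, a downing decision moves it to `down`, and removal happens last. The progression is one-way; a member never moves back to an earlier status. A tiny standalone model of that ordering (illustrative only; `MemberStatus` and `mark` here are invented stand-ins, not the library's types):

    // Minimal stand-in for the membership status progression exercised by these tests.
    enum MemberStatus: Int, Comparable {
        case joining = 0, up, leaving, down, removed
        static func < (lhs: MemberStatus, rhs: MemberStatus) -> Bool { lhs.rawValue < rhs.rawValue }
    }

    struct ToyMembership {
        var statuses: [String: MemberStatus] = [:] // keyed by node address

        mutating func join(_ node: String) {
            statuses[node] = statuses[node] ?? .joining
        }

        /// Statuses only ever move "forward"; marking a member with an older status is a no-op.
        @discardableResult
        mutating func mark(_ node: String, as status: MemberStatus) -> MemberStatus? {
            guard let current = statuses[node], status > current else { return nil }
            statuses[node] = status
            return status
        }
    }

    var toyMembership = ToyMembership()
    toyMembership.join("nodeB")
    toyMembership.mark("nodeB", as: .up)
    toyMembership.mark("nodeB", as: .down)
    toyMembership.mark("nodeB", as: .up)   // ignored: a member cannot go back from .down to .up
    assert(toyMembership.statuses["nodeB"] == .down)
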
+ + var nodeA: UniqueNode { + self.stateA.myselfNode } - func test_joining_to_up_earlyYetStillLettingAllNodesKnowAboutLatestMembershipStatus() throws { - try shouldNotThrow { - // This showcases a racy situation, where we allow a leader elected when at least 2 nodes joined - // yet we actually join 3 nodes -- meaning that the joining up is _slightly_ racy: - // - maybe nodes 1 and 2 join each other first and 1 starts upping - // - maybe nodes 2 and 3 join each other and 2 starts upping - // - and at the same time, maybe while 1 and 2 have started joining, 2 and 3 already joined, and 2 issued up for itself and 3 - // - // All this is _fine_. The cluster leader is such that under whichever rules we allowed it to be elected - // it shall perform its duties. This tests however quickly shows that lack of letting the "third" node, - // via gossip or some other way about the ->up of other nodes once it joins the "others", it'd be stuck waiting for - // the ->up forever. - // - // In other words, this test exercises that there must be _some_ (gossip, or similar "push" membership once a new member joins), - // to a new member. - // - let first = self.setUpNode("first") { settings in - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - } - let second = self.setUpNode("second") { settings in - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - } - let third = self.setUpNode("third") { settings in - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - } - - let fourth = self.setUpNode("fourth") { settings in - settings.cluster.autoLeaderElection = .none // even without election running, it will be notified by things by the others - } - - first.cluster.join(node: second.cluster.node.node) - third.cluster.join(node: second.cluster.node.node) - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third) - - // Even the fourth node now could join and be notified about all the existing up members. - // It does not even have to run any leadership election -- there are leaders in the cluster. 
- // - // We only join one arbitrary node, we will be notified about all nodes: - fourth.cluster.join(node: third.cluster.node.node) - - try self.ensureNodes(.up, within: .seconds(10), systems: first, second, third, fourth) + var nodeB: UniqueNode { + self.stateB.myselfNode + } + + var nodeC: UniqueNode { + self.stateC.myselfNode + } + + override func setUp() { + self.stateA = ClusterShellState.makeTestMock(side: .server) { settings in + settings.node = self._nodeA + } + self.stateB = ClusterShellState.makeTestMock(side: .server) { settings in + settings.node = self._nodeB } + self.stateC = ClusterShellState.makeTestMock(side: .server) { settings in + settings.node = self._nodeC + } + + _ = self.stateA.membership.join(self.nodeA) + _ = self.stateA.membership.join(self.nodeB) + _ = self.stateA.membership.join(self.nodeC) + + _ = self.stateB.membership.join(self.nodeA) + _ = self.stateB.membership.join(self.nodeB) + _ = self.stateB.membership.join(self.nodeC) + + _ = self.stateC.membership.join(self.nodeA) + _ = self.stateC.membership.join(self.nodeB) + _ = self.stateC.membership.join(self.nodeC) + _ = self.stateA.latestGossip.mergeForward(incoming: self.stateB.latestGossip) + + _ = self.stateA.latestGossip.mergeForward(incoming: self.stateC.latestGossip) + + _ = self.stateB.latestGossip.mergeForward(incoming: self.stateA.latestGossip) + _ = self.stateC.latestGossip.mergeForward(incoming: self.stateA.latestGossip) } - func test_ensureAllSubscribersGetMovingUpEvents() throws { - try shouldNotThrow { - // it shall perform its duties. This tests however quickly shows that lack of letting the "third" node, - // via gossip or some other way about the ->up of other nodes once it joins the "others", it'd be stuck waiting for - // the ->up forever. - // - // In other words, this test exercises that there must be _some_ (gossip, or similar "push" membership once a new member joins), - // to a new member. 
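
The mock setup above merges each state's gossip forward into the others, so that every node's seen table records which version of the membership every other node has observed. A rough standalone sketch of that bookkeeping, to make the later convergence assertions easier to follow (a toy version vector and seen table, not the library's `Cluster.Gossip.SeenTable`):

    // Toy version vector: per-replica counters, merged by taking the per-key maximum.
    struct ToyVersionVector: Equatable {
        var state: [String: Int] = [:]
        mutating func increment(at replica: String) { state[replica, default: 0] += 1 }
        mutating func merge(_ other: ToyVersionVector) { state.merge(other.state, uniquingKeysWith: max) }
    }

    // Toy seen table: for every node, the version vector that node is known to have observed.
    struct ToySeenTable {
        var rows: [String: ToyVersionVector] = [:]
        /// Merging gossip from `peer` records what `peer` has seen, and folds it into the owner's own row as well.
        mutating func mergeForward(owner: String, from peer: String, incoming: ToyVersionVector) {
            rows[peer, default: ToyVersionVector()].merge(incoming)
            rows[owner, default: ToyVersionVector()].merge(incoming)
        }
    }

    var rowA = ToyVersionVector()
    rowA.increment(at: "A")                                        // A has observed "A:1"
    var gossipFromB = ToyVersionVector()
    gossipFromB.increment(at: "B"); gossipFromB.increment(at: "B") // B's gossip carries "B:2"

    var tableA = ToySeenTable(rows: ["A": rowA])
    tableA.mergeForward(owner: "A", from: "B", incoming: gossipFromB)
    assert(tableA.rows["A"] == ToyVersionVector(state: ["A": 1, "B": 2])) // A's own row now covers B's observations
    assert(tableA.rows["B"] == ToyVersionVector(state: ["B": 2]))         // ...and records what B is known to have seen
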
- // - let first = self.setUpNode("first") { settings in - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - } - let p1 = self.testKit(first).spawnTestProbe(expecting: Cluster.Event.self) - first.cluster.events.subscribe(p1.ref) - - let second = self.setUpNode("second") { settings in - settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - } - let p2 = self.testKit(second).spawnTestProbe(expecting: Cluster.Event.self) - second.cluster.events.subscribe(p2.ref) - - first.cluster.join(node: second.cluster.node.node) - - // this ensures that the membership, as seen in ClusterShell converged on all members being up - try self.ensureNodes(.up, systems: first, second) - - // the following tests confirm that the manually subscribed actors, got all the events they expected - - // on the leader node, the other node noticed as up: - let eventsOnFirstSub = try p1.expectMessages(count: 6) - eventsOnFirstSub.shouldContain(.snapshot(.empty)) - eventsOnFirstSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: .joining, toStatus: .joining))) - eventsOnFirstSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: nil, toStatus: .joining))) - eventsOnFirstSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: .joining, toStatus: .up))) - eventsOnFirstSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: .joining, toStatus: .up))) - eventsOnFirstSub.shouldContain(.leadershipChange(Cluster.LeadershipChange(oldLeader: nil, newLeader: .init(node: first.cluster.node, status: .joining))!)) // !-safe, since new/old leader known to be different - - // on non-leader node - let eventsOnSecondSub = try p2.expectMessages(count: 6) - eventsOnSecondSub.shouldContain(.snapshot(.empty)) - eventsOnSecondSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: nil, toStatus: .joining))) - eventsOnSecondSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: .joining, toStatus: .joining))) - eventsOnSecondSub.shouldContain(.membershipChange(.init(node: first.cluster.node, fromStatus: .joining, toStatus: .up))) - eventsOnSecondSub.shouldContain(.membershipChange(.init(node: second.cluster.node, fromStatus: .joining, toStatus: .up))) - eventsOnSecondSub.shouldContain(.leadershipChange(Cluster.LeadershipChange(oldLeader: nil, newLeader: .init(node: first.cluster.node, status: .joining))!)) // !-safe, since new/old leader known to be different + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: Moving members to .removed + + func test_leaderActions_removeDownMembers_ifKnownAsDownToAllMembers() { + // make A the leader + let makeFirstTheLeader = Cluster.LeadershipChange(oldLeader: nil, newLeader: self.stateA.membership.member(self.nodeA.node)!)! 
+ _ = self.stateA.applyClusterEvent(.leadershipChange(makeFirstTheLeader)) + + // time to mark B as .down + _ = self.stateA.membership.mark(self.nodeB, as: .down) // only F knows that S is .down + + // ensure some nodes are up, so they participate in the convergence check + _ = self.stateA.membership.mark(self.nodeA, as: .up) + _ = self.stateA.membership.mark(self.nodeC, as: .up) + + // not yet converged, the first/third members need to chat some more, to ensure first knows that third knows that second is down + self.stateA.latestGossip.converged().shouldBeFalse() + let moveMembersUp = self.stateA.collectLeaderActions() + moveMembersUp.count.shouldEqual(0) + + // we tell others about the latest gossip, that B is down + // second does not get the information, let's assume we cannot communicate with it + self.gossip(from: self.stateA, to: &self.stateC) + + // first and second now know that second is down + self.stateA.membership.uniqueMember(self.nodeB)!.status.shouldEqual(.down) + self.stateC.membership.uniqueMember(self.nodeB)!.status.shouldEqual(.down) + + // we gossiped to C, and it knows, but we don't know yet if it knows (if it has received the information) + self.stateA.latestGossip.converged().shouldBeFalse() + + // after a gossip back... + self.gossip(from: self.stateC, to: &self.stateA) + + // now first knows that all other up/leaving members also know about second being .down + self.stateA.latestGossip.converged().shouldBeTrue() + self.stateC.latestGossip.converged().shouldBeTrue() // also true, but not needed for the leader to make the decision + + let hopefullyRemovalActions = self.stateA.collectLeaderActions() + hopefullyRemovalActions.count.shouldEqual(1) + guard case .some(.removeMember(let member)) = hopefullyRemovalActions.first else { + XCTFail("Expected a member removal action, but did not get one, actions: \(hopefullyRemovalActions)") + return } + member.status.isDown.shouldBeTrue() + member.node.shouldEqual(self.nodeB) + + // interpret leader actions would interpret it by removing the member now and tombstone-ing it, + // see `interpretLeaderActions` + _ = self.stateA.membership.removeCompletely(self.nodeB) + self.stateA.latestGossip.membership.uniqueMember(self.nodeB).shouldBeNil() + + // once we (leader) have performed removal and talk to others, they should also remove and prune seen tables + // + // once a node is removed, it should not be seen in gossip seen tables anymore + _ = self.gossip(from: self.stateA, to: &self.stateC) + } + + func test_leaderActions_removeDownMembers_dontRemoveIfDownNotKnownToAllMembersYet() { + // A is .down, but + _ = self.stateB._latestGossip = .parse( + """ + A.down B.up C.up [leader:B] + A: A:7 B:2 + B: A:7 B:10 C:6 + C: A:7 B:5 C:6 + """, owner: self.nodeB, nodes: self.allNodes + ) + + self.stateB.latestGossip.converged().shouldBeFalse() + let moveMembersUp = self.stateA.collectLeaderActions() + moveMembersUp.count.shouldEqual(0) + } + + @discardableResult + private func gossip(from: ClusterShellState, to: inout ClusterShellState) -> Cluster.Gossip.MergeDirective { + to.latestGossip.mergeForward(incoming: from.latestGossip) } + + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: Moving members .up + + // TODO: same style of tests for upNumbers and moving members up } diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift 
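
The removal tests above pin down the key invariant: the leader may only move a `.down` member to `.removed` once gossip has converged, that is, once every member that is not itself down has seen the `.down` status. A condensed standalone sketch of that decision rule (a toy model for illustration, not the library's `collectLeaderActions`):

    // Toy convergence gate: a down member is only removed once all live members have seen the latest gossip.
    struct ObservedMember {
        var node: String
        var isDown: Bool
        var hasSeenLatestGossip: Bool // has this member acknowledged the gossip carrying the .down?
    }

    /// The leader's removal rule, reduced to its core: act only after every live member has caught up.
    func collectRemovals(_ membership: [ObservedMember]) -> [String] {
        let live = membership.filter { !$0.isDown }
        guard live.allSatisfy({ $0.hasSeenLatestGossip }) else { return [] } // not converged yet, keep gossiping
        return membership.filter { $0.isDown }.map { $0.node }
    }

    var observed = [
        ObservedMember(node: "A", isDown: false, hasSeenLatestGossip: true),
        ObservedMember(node: "B", isDown: true,  hasSeenLatestGossip: false),
        ObservedMember(node: "C", isDown: false, hasSeenLatestGossip: false),
    ]
    assert(collectRemovals(observed).isEmpty)   // C has not seen the .down yet, so no removal is issued
    observed[2].hasSeenLatestGossip = true
    assert(collectRemovals(observed) == ["B"])  // converged: the down member can be removed and tombstoned
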
index d3d52ba90..4a810c14c 100644 --- a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests+XCTest.swift @@ -23,10 +23,14 @@ import XCTest extension DowningClusteredTests { static var allTests: [(String, (DowningClusteredTests) -> () throws -> Void)] { return [ - ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_leaveSelf", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_leaveSelf), - ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downSelf", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downSelf), - ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_shutdownSelf", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_shutdownSelf), - ("test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downFromSecond", test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downFromSecond), + ("test_stopLeader_by_leaveSelfNode_shouldPropagateToOtherNodes", test_stopLeader_by_leaveSelfNode_shouldPropagateToOtherNodes), + ("test_stopMember_by_leaveSelfNode_shouldPropagateToOtherNodes", test_stopMember_by_leaveSelfNode_shouldPropagateToOtherNodes), + ("test_stopLeader_by_downSelf_shouldPropagateToOtherNodes", test_stopLeader_by_downSelf_shouldPropagateToOtherNodes), + ("test_stopMember_by_downSelf_shouldPropagateToOtherNodes", test_stopMember_by_downSelf_shouldPropagateToOtherNodes), + ("test_stopLeader_by_downByMember_shouldPropagateToOtherNodes", test_stopLeader_by_downByMember_shouldPropagateToOtherNodes), + ("test_stopMember_by_downByMember_shouldPropagateToOtherNodes", test_stopMember_by_downByMember_shouldPropagateToOtherNodes), + ("test_stopLeader_by_shutdownSelf_shouldPropagateToOtherNodes", test_stopLeader_by_shutdownSelf_shouldPropagateToOtherNodes), + ("test_stopMember_by_shutdownSelf_shouldPropagateToOtherNodes", test_stopMember_by_shutdownSelf_shouldPropagateToOtherNodes), ] } } diff --git a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift index 0fce098bc..475552c02 100644 --- a/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/DowningClusteredTests.swift @@ -16,77 +16,184 @@ import DistributedActorsTestKit import XCTest +// "Get down!" final class DowningClusteredTests: ClusteredNodesTestBase { enum NodeStopMethod { - case leaveSelf // TODO: eventually this one will be more graceful, ensure others see us leave etc + case leaveSelfNode // TODO: eventually this one will be more graceful, ensure others see us leave etc case downSelf case shutdownSelf - case downFromSecond + case downFromOtherMember } - func shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: NodeStopMethod) throws { - let (first, second) = self.setUpPair { settings in + /// Selects which node to stop + enum StopNodeSelection { + case firstLeader // the first node is going to be the leader, so testing for downing the leader and a non-leader is recommended. 
+ case secondNonLeader + } + + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: Shared Settings + + private var downingStrategy: DowningStrategySettings { + .timeout(.init(downUnreachableMembersAfter: .milliseconds(200))) + } + + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: Downing + + func shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: NodeStopMethod, stopNode: StopNodeSelection, _ modifySettings: ((inout ActorSystemSettings) -> Void)? = nil) throws { + let (first, second) = self.setUpPair(modifySettings) + let thirdNeverDownSystem = self.setUpNode("third", modifySettings) + let thirdNeverDownNode = thirdNeverDownSystem.cluster.node + + try self.joinNodes(node: first, with: second, ensureMembers: .up) + try self.joinNodes(node: thirdNeverDownSystem, with: second, ensureMembers: .up) + + let expectedDownSystem: ActorSystem + let otherNotDownPairSystem: ActorSystem + switch stopNode { + case .firstLeader: + expectedDownSystem = first + otherNotDownPairSystem = second + case .secondNonLeader: + expectedDownSystem = second + otherNotDownPairSystem = first + } + + let expectedDownNode = expectedDownSystem.cluster.node + + // we start cluster event probes early, so they get the events one by one as they happen + let eventsProbeOther = self.testKit(otherNotDownPairSystem).spawnTestProbe(subscribedTo: otherNotDownPairSystem.cluster.events) + let eventsProbeThird = self.testKit(thirdNeverDownSystem).spawnTestProbe(subscribedTo: thirdNeverDownSystem.cluster.events) + + pinfo("Expecting [\(expectedDownSystem)] to become [.down], method to stop the node [\(stopMethod)]") + + // we cause the stop of the target node as expected + switch (stopMethod, stopNode) { + case (.leaveSelfNode, .firstLeader): first.cluster.leave() + case (.leaveSelfNode, .secondNonLeader): second.cluster.leave() + + case (.downSelf, .firstLeader): first.cluster.down(node: first.cluster.node.node) + case (.downSelf, .secondNonLeader): second.cluster.down(node: second.cluster.node.node) + + case (.shutdownSelf, .firstLeader): first.shutdown() + case (.shutdownSelf, .secondNonLeader): second.shutdown() + + case (.downFromOtherMember, .firstLeader): second.cluster.down(node: first.cluster.node.node) + case (.downFromOtherMember, .secondNonLeader): thirdNeverDownSystem.cluster.down(node: second.cluster.node.node) + } + + func expectedDownMemberEventsFishing(on: ActorSystem) -> (Cluster.Event) -> ActorTestProbe.FishingDirective { + { event in + switch event { + case .membershipChange(let change) where change.node == expectedDownNode && change.isRemoval: + pinfo("MembershipChange on \(on.cluster.node.node): \(change)") + return .catchComplete(change) + case .membershipChange(let change) where change.node == expectedDownNode: + pinfo("MembershipChange on \(on.cluster.node.node): \(change)") + return .catchContinue(change) + case .reachabilityChange(let change) where change.member.node == expectedDownNode: + pnote("ReachabilityChange on \(otherNotDownPairSystem.cluster.node.node) = \(change)") + return .ignore + default: + // pnote("Event on \(otherNotDownPairSystem.cluster.node.node) = \(event)") + return .ignore + } + } + } + + // collect all events regarding the expectedDownNode's membership lifecycle + let eventsOnOther = try eventsProbeOther.fishFor(Cluster.MembershipChange.self, within: .seconds(10), expectedDownMemberEventsFishing(on: 
otherNotDownPairSystem)) + let eventsOnThird = try eventsProbeThird.fishFor(Cluster.MembershipChange.self, within: .seconds(10), expectedDownMemberEventsFishing(on: thirdNeverDownSystem)) + + eventsOnOther.shouldContain(where: { change in change.toStatus.isDown && (change.fromStatus == .joining || change.fromStatus == .up) }) + eventsOnOther.shouldContain(Cluster.MembershipChange(node: expectedDownNode, fromStatus: .down, toStatus: .removed)) + + eventsOnOther.shouldContain(where: { change in change.toStatus.isDown && (change.fromStatus == .joining || change.fromStatus == .up) }) + eventsOnThird.shouldContain(Cluster.MembershipChange(node: expectedDownNode, fromStatus: .down, toStatus: .removed)) + } + + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: Stop by: cluster.leave() + + func test_stopLeader_by_leaveSelfNode_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .leaveSelfNode, stopNode: .firstLeader) { settings in settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - settings.cluster.downingStrategy = .timeout(.default) + settings.cluster.downingStrategy = self.downingStrategy } - let third = self.setUpNode("third") { settings in + } + + func test_stopMember_by_leaveSelfNode_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .leaveSelfNode, stopNode: .secondNonLeader) { settings in settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - settings.cluster.downingStrategy = .timeout(.default) - } - try self.joinNodes(node: first, with: second, ensureMembers: .up) - try self.joinNodes(node: third, with: second, ensureMembers: .up) - - switch stopMethod { - case .leaveSelf: - first.cluster.leave() - case .downSelf: - first.cluster.down(node: first.cluster.node.node) - case .shutdownSelf: - first.shutdown() - case .downFromSecond: - second.cluster.down(node: first.cluster.node.node) + settings.cluster.downingStrategy = self.downingStrategy } + } - switch stopMethod { - case .leaveSelf, .downSelf, .downFromSecond: - _ = try shouldNotThrow { - try self.testKit(second).eventually(within: .seconds(10)) { - try self.capturedLogs(of: first).shouldContain(prefix: "Self node was marked [.down]!", failTest: false) - } - } - case .shutdownSelf: - () // no extra assertion + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: Stop by: cluster.down(selfNode) + + func test_stopLeader_by_downSelf_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .downSelf, stopNode: .firstLeader) { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = self.downingStrategy } + } - _ = try shouldNotThrow { - try self.testKit(second).eventually(within: .seconds(20)) { - // at least one of the nodes will first realize that first is unreachable, - // upon doing so, the leader election picks a new leader -- it will be `second` on lowest-reachable-address leadership for example. 
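
The probes above "fish" the cluster event stream: they keep only the membership changes of the node that is expected to die and stop once the terminal `.removed` change arrives. The same filtering, written as a plain standalone function over an already-collected event list (the enum and struct here are simplified stand-ins for `Cluster.Event` and `Cluster.MembershipChange`, invented for illustration):

    // Simplified stand-ins for the event and change types consumed by the probes above.
    struct ToyMembershipChange: Equatable {
        var node: String
        var fromStatus: String?
        var toStatus: String
        var isRemoval: Bool { toStatus == "removed" }
    }

    enum ToyClusterEvent {
        case membershipChange(ToyMembershipChange)
        case reachabilityChange(node: String, reachable: Bool)
    }

    /// Collect the changes concerning `node`, stopping once its removal is observed.
    func fishMembershipChanges(for node: String, in events: [ToyClusterEvent]) -> [ToyMembershipChange] {
        var caught: [ToyMembershipChange] = []
        for event in events {
            guard case .membershipChange(let change) = event, change.node == node else { continue } // ignore the rest
            caught.append(change)
            if change.isRemoval { break } // the lifecycle of this node is complete
        }
        return caught
    }

    let toyEvents: [ToyClusterEvent] = [
        .membershipChange(.init(node: "second", fromStatus: nil, toStatus: "joining")),
        .reachabilityChange(node: "second", reachable: false),
        .membershipChange(.init(node: "third", fromStatus: "joining", toStatus: "up")),
        .membershipChange(.init(node: "second", fromStatus: "up", toStatus: "down")),
        .membershipChange(.init(node: "second", fromStatus: "down", toStatus: "removed")),
    ]
    let caught = fishMembershipChanges(for: "second", in: toyEvents)
    assert(caught.count == 3 && caught.last?.isRemoval == true)
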
- // As we now have a leader, downing may perform it's move.\ + func test_stopMember_by_downSelf_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .downSelf, stopNode: .secondNonLeader) { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) - try self.capturedLogs(of: second).shouldContain(grep: "@localhost:9001, status: down", failTest: false) - try self.capturedLogs(of: third).shouldContain(grep: "@localhost:9001, status: down", failTest: false) - } + settings.cluster.downingStrategy = self.downingStrategy } } - func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_leaveSelf() throws { - try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .leaveSelf) + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: Stop by system.shutdown() + + func test_stopLeader_by_downByMember_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .downFromOtherMember, stopNode: .firstLeader) { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = self.downingStrategy + } } - func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downSelf() throws { - try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .downSelf) + func test_stopMember_by_downByMember_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .downFromOtherMember, stopNode: .secondNonLeader) { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = self.downingStrategy + } } - func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_shutdownSelf() throws { - try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .shutdownSelf) + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: Stop by: otherSystem.cluster.down(theNode) + + func test_stopLeader_by_shutdownSelf_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .shutdownSelf, stopNode: .firstLeader) { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = self.downingStrategy + } } - func test_stoppingSelfNode_shouldPropagateToOtherNodes_downedBy_downFromSecond() throws { - try self.shared_stoppingSelfNode_shouldPropagateToOtherNodes(stopMethod: .downFromSecond) + func test_stopMember_by_shutdownSelf_shouldPropagateToOtherNodes() throws { + try self.shared_stoppingNode_shouldPropagateToOtherNodesAsDown(stopMethod: .shutdownSelf, stopNode: .secondNonLeader) { settings in + settings.cluster.onDownAction = .gracefulShutdown(delay: .seconds(0)) + settings.cluster.autoLeaderElection = .lowestReachable(minNumberOfMembers: 2) + + settings.cluster.downingStrategy = self.downingStrategy + } } } diff --git 
a/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift b/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift index 52941bfa0..1c514792e 100644 --- a/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift +++ b/Tests/DistributedActorsTests/Cluster/DowningStrategy/TimeoutBasedDowningInstanceTests.swift @@ -148,22 +148,26 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { // we are the leader, if a timeout happens, we should issue .down _ = try self.instance.onLeaderChange(to: self.selfMember) - let directive = self.instance.onMemberUnreachable(.init(member: self.otherMember.asUnreachable)) - guard case .startTimer = directive else { - throw TestError("Expected .startTimer, but got \(directive)") - } + let unreachableMember = self.otherMember.asUnreachable - let downDecision = self.instance.onTimeout(self.otherMember) - guard case .markAsDown(let nodesToDown) = downDecision else { - throw TestError("Expected .markAsDown, but got \(directive)") - } + try shouldNotThrow { + let directive = self.instance.onMemberUnreachable(.init(member: unreachableMember)) + guard case .startTimer = directive else { + throw TestError("Expected .startTimer, but got \(directive)") + } - nodesToDown.shouldEqual([self.otherMember]) + let downDecision = self.instance.onTimeout(unreachableMember) + guard case .markAsDown(let nodesToDown) = downDecision else { + throw TestError("Expected .markAsDown, but got \(directive)") + } + + nodesToDown.shouldEqual([unreachableMember]) - // since we signalled the .down, no need to retain the member as "to be marked down" anymore, - // if we never cleaned that set it could be technically a memory leak, continuing to accumulate - // all members that we downed over the lifetime of the cluster. - self.instance._markAsDown.shouldBeEmpty() + // since we signalled the .down, no need to retain the member as "to be marked down" anymore, + // if we never cleaned that set it could be technically a memory leak, continuing to accumulate + // all members that we downed over the lifetime of the cluster. + self.instance._markAsDown.shouldBeEmpty() + } } // ==== ------------------------------------------------------------------------------------------------------------ @@ -238,6 +242,6 @@ final class TimeoutBasedDowningInstanceTests: XCTestCase { guard case .startTimer = self.instance.onMemberUnreachable(.init(member: member.asUnreachable)) else { throw TestError("Expected directive to be .startTimer") } - self.instance._unreachable.shouldContain(member) + self.instance._unreachable.shouldContain(member.asUnreachable) } } diff --git a/Tests/DistributedActorsTests/Cluster/MembershipGossipSeenTableTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/GossipSeenTableTests+XCTest.swift similarity index 81% rename from Tests/DistributedActorsTests/Cluster/MembershipGossipSeenTableTests+XCTest.swift rename to Tests/DistributedActorsTests/Cluster/GossipSeenTableTests+XCTest.swift index e3797c7e9..d33d1140f 100644 --- a/Tests/DistributedActorsTests/Cluster/MembershipGossipSeenTableTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/GossipSeenTableTests+XCTest.swift @@ -20,10 +20,10 @@ import XCTest /// Do NOT edit this file directly as it will be regenerated automatically when needed. 
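
The timeout-based downing instance exercised above reacts to unreachability in two steps: an unreachable member only starts a timer, and only when that timer fires, and this node is the leader, is a mark-as-down decision issued, after which the member no longer needs to be tracked. A condensed standalone model of that state machine (the directive names echo the test above, but the implementation is a sketch, not the library's downing instance):

    enum DowningDirective: Equatable {
        case none
        case startTimer(member: String)
        case markAsDown(members: Set<String>)
    }

    /// Toy timeout-based downing: unreachable members are downed only after a grace period, and only by the leader.
    struct ToyTimeoutDowning {
        var isLeader: Bool
        var pendingUnreachable: Set<String> = []

        mutating func onMemberUnreachable(_ member: String) -> DowningDirective {
            pendingUnreachable.insert(member)
            return .startTimer(member: member)  // give it the configured grace period to come back
        }

        mutating func onMemberReachable(_ member: String) -> DowningDirective {
            pendingUnreachable.remove(member)   // recovered in time: the pending decision is dropped
            return .none
        }

        mutating func onTimeout(_ member: String) -> DowningDirective {
            guard isLeader, pendingUnreachable.remove(member) != nil else { return .none }
            return .markAsDown(members: [member]) // nothing is retained afterwards, so the set cannot leak members
        }
    }

    var downing = ToyTimeoutDowning(isLeader: true)
    assert(downing.onMemberUnreachable("second") == .startTimer(member: "second"))
    assert(downing.onTimeout("second") == .markAsDown(members: ["second"]))
    assert(downing.pendingUnreachable.isEmpty)
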
/// -extension MembershipGossipSeenTableTests { - static var allTests: [(String, (MembershipGossipSeenTableTests) -> () throws -> Void)] { +extension GossipSeenTableTests { + static var allTests: [(String, (GossipSeenTableTests) -> () throws -> Void)] { return [ - ("test_seenTable_compare_concurrent", test_seenTable_compare_concurrent), + ("test_seenTable_compare_concurrent_eachOtherDontKnown", test_seenTable_compare_concurrent_eachOtherDontKnown), ("test_incrementVersion", test_incrementVersion), ("test_seenTable_merge_notYetSeenInformation", test_seenTable_merge_notYetSeenInformation), ("test_seenTable_merge_sameInformation", test_seenTable_merge_sameInformation), @@ -31,6 +31,7 @@ extension MembershipGossipSeenTableTests { ("test_seenTable_merge_behindInformation", test_seenTable_merge_behindInformation), ("test_seenTable_merge_concurrentInformation", test_seenTable_merge_concurrentInformation), ("test_seenTable_merge_concurrentInformation_unknownMember", test_seenTable_merge_concurrentInformation_unknownMember), + ("test_prune_removeNodeFromSeenTable", test_prune_removeNodeFromSeenTable), ] } } diff --git a/Tests/DistributedActorsTests/Cluster/GossipSeenTableTests.swift b/Tests/DistributedActorsTests/Cluster/GossipSeenTableTests.swift new file mode 100644 index 000000000..19b0b7a87 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/GossipSeenTableTests.swift @@ -0,0 +1,234 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +@testable import DistributedActors +import DistributedActorsTestKit +import NIO +import XCTest + +/// Tests of just the datatype +final class GossipSeenTableTests: XCTestCase { + typealias SeenTable = Cluster.Gossip.SeenTable + + var nodeA: UniqueNode! + var nodeB: UniqueNode! + var nodeC: UniqueNode! + + lazy var allNodes = [ + self.nodeA!, self.nodeB!, self.nodeC!, + ] + + override func setUp() { + super.setUp() + self.nodeA = UniqueNode(protocol: "sact", systemName: "firstA", host: "127.0.0.1", port: 7111, nid: .random()) + self.nodeB = UniqueNode(protocol: "sact", systemName: "secondB", host: "127.0.0.1", port: 7222, nid: .random()) + self.nodeC = UniqueNode(protocol: "sact", systemName: "thirdC", host: "127.0.0.1", port: 7333, nid: .random()) + } + + func test_seenTable_compare_concurrent_eachOtherDontKnown() { + let table = Cluster.Gossip.SeenTable.parse( + """ + A: A@1 + """, nodes: self.allNodes + ) + + let incoming = Cluster.Gossip.SeenTable.parse( + """ + B: B@1 + """, nodes: self.allNodes + ) + + // neither node knew about each other, so the updates were concurrent; + table.compareVersion(observedOn: self.nodeA, to: incoming.version(at: self.nodeB)!) + .shouldEqual(VersionVector.CausalRelation.concurrent) + + incoming.compareVersion(observedOn: self.nodeB, to: table.version(at: self.nodeA)!) 
+ .shouldEqual(VersionVector.CausalRelation.concurrent) + } + + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: increments + + func test_incrementVersion() { + var table = Cluster.Gossip.SeenTable(myselfNode: self.nodeA, version: .init()) + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("", nodes: self.allNodes)) + + table.incrementVersion(owner: self.nodeA, at: self.nodeA) + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1", nodes: self.allNodes)) + + table.incrementVersion(owner: self.nodeA, at: self.nodeA) + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:2", nodes: self.allNodes)) + + table.incrementVersion(owner: self.nodeA, at: self.nodeB) + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:2 B:1", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldBeNil() + + table.incrementVersion(owner: self.nodeB, at: self.nodeB) + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:2 B:1", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(VersionVector.parse("B:1", nodes: self.allNodes)) + } + + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: merge + + func test_seenTable_merge_notYetSeenInformation() { + var table = Cluster.Gossip.SeenTable.parse( + """ + A: A:1 + """, nodes: self.allNodes + ) + + let incoming = Cluster.Gossip.SeenTable.parse( + """ + B: B:2 + """, nodes: self.allNodes + ) + + table.merge(selfOwner: self.nodeA, incoming: incoming) + + table.version(at: self.nodeA) + .shouldEqual(VersionVector.parse("A:1 B:2", nodes: self.allNodes)) + table.version(at: self.nodeB) + .shouldEqual(VersionVector.parse("B:2", nodes: self.allNodes)) + } + + func test_seenTable_merge_sameInformation() { + // a situation in which the two nodes have converged, so their versions are .same + + var table = Cluster.Gossip.SeenTable(myselfNode: self.nodeA, version: .init()) + table.incrementVersion(owner: self.nodeA, at: self.nodeA) // A observed: A:1 + table.incrementVersion(owner: self.nodeA, at: self.nodeB) // A observed: A:1 B:1 + table.incrementVersion(owner: self.nodeA, at: self.nodeB) // A observed: A:1 B:2 + + var incoming = Cluster.Gossip(ownerNode: self.nodeB) // B observed: + incoming.incrementOwnerVersion() // B observed: B:1 + incoming.incrementOwnerVersion() // B observed: B:2 + incoming.seen.incrementVersion(owner: self.nodeB, at: self.nodeA) // B observed: A:1 B:2 + + table.merge(selfOwner: self.nodeA, incoming: incoming.seen) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1 B:2", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(VersionVector.parse("A:1 B:2", nodes: self.allNodes)) + } + + func test_seenTable_merge_aheadInformation() { + // the incoming gossip is "ahead" and has some more information + + var table = Cluster.Gossip.SeenTable(myselfNode: self.nodeA, version: .init()) + table.incrementVersion(owner: self.nodeA, at: self.nodeA) // A observed: A:1 + + var incoming = Cluster.Gossip(ownerNode: self.nodeB) // B observed: + incoming.incrementOwnerVersion() // B observed: B:1 + incoming.incrementOwnerVersion() // B observed: B:2 + incoming.seen.incrementVersion(owner: self.nodeB, at: self.nodeA) // B observed: A:1 B:2 + + table.merge(selfOwner: self.nodeA, incoming: incoming.seen) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1 B:2", nodes: self.allNodes)) + 
table.version(at: self.nodeB).shouldEqual(VersionVector.parse("A:1 B:2", nodes: self.allNodes)) + } + + func test_seenTable_merge_behindInformation() { + // the incoming gossip is "behind" + + var table = Cluster.Gossip.SeenTable(myselfNode: self.nodeA, version: .init()) + table.incrementVersion(owner: self.nodeA, at: self.nodeA) // A observed: A:1 + table.incrementVersion(owner: self.nodeA, at: self.nodeB) // A observed: A:1 B:1 + table.incrementVersion(owner: self.nodeA, at: self.nodeB) // A observed: A:1 B:2 + + var incoming = Cluster.Gossip(ownerNode: self.nodeB) // B observed: + incoming.incrementOwnerVersion() // B observed: B:1 + incoming.incrementOwnerVersion() // B observed: B:2 + + table.merge(selfOwner: self.nodeA, incoming: incoming.seen) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1 B:2", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(VersionVector.parse("B:2", nodes: self.allNodes)) + } + + func test_seenTable_merge_concurrentInformation() { + // the incoming gossip is "concurrent" + + var table = Cluster.Gossip.SeenTable(myselfNode: self.nodeA, version: .init()) + table.incrementVersion(owner: self.nodeA, at: self.nodeA) // A observed: A:1 + table.incrementVersion(owner: self.nodeA, at: self.nodeB) // A observed: A:1 B:1 + table.incrementVersion(owner: self.nodeA, at: self.nodeB) // A observed: A:1 B:2 + table.incrementVersion(owner: self.nodeA, at: self.nodeC) // A observed: A:1 B:2 C:1 + // M has seen gossip from S, when it was at t=2 + table.incrementVersion(owner: self.nodeB, at: self.nodeB) // B observed: B:2 + table.incrementVersion(owner: self.nodeB, at: self.nodeB) // B observed: B:3 + + // in reality S is quite more far ahead, already at t=4 + var incoming = Cluster.Gossip(ownerNode: self.nodeB) // B observed + incoming.incrementOwnerVersion() // B observed: B:1 + incoming.incrementOwnerVersion() // B observed: B:2 + incoming.incrementOwnerVersion() // B observed: B:3 + incoming.seen.incrementVersion(owner: self.nodeB, at: self.nodeC) // B observed: B:3 C:1 + + table.merge(selfOwner: self.nodeA, incoming: incoming.seen) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1 B:3 C:1", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(incoming.seen.version(at: self.nodeB)) + } + + func test_seenTable_merge_concurrentInformation_unknownMember() { + // the incoming gossip is "concurrent", and has a table entry for a node we don't know + + var table = Cluster.Gossip.SeenTable.parse( + """ + A: A:4 + """, nodes: self.allNodes + ) + + let incoming = Cluster.Gossip.SeenTable.parse( + """ + A: A:1 + B: B:2 C:1 + C: C:1 + """, nodes: self.allNodes + ) + + table.merge(selfOwner: self.nodeA, incoming: incoming) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:4 B:2 C:1", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(VersionVector.parse("B:2 C:1", nodes: self.allNodes)) + table.version(at: self.nodeC).shouldEqual(VersionVector.parse("C:1", nodes: self.allNodes)) + } + + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: Prune + + func test_prune_removeNodeFromSeenTable() { + var table = Cluster.Gossip.SeenTable(myselfNode: self.nodeA, version: .init()) + table.incrementVersion(owner: self.nodeA, at: self.nodeA) + table.incrementVersion(owner: self.nodeA, at: self.nodeC) + + table.incrementVersion(owner: self.nodeB, at: self.nodeC) + table.incrementVersion(owner: self.nodeB, at: 
self.nodeB) + table.incrementVersion(owner: self.nodeB, at: self.nodeA) + + table.incrementVersion(owner: self.nodeC, at: self.nodeC) + table.incrementVersion(owner: self.nodeC, at: self.nodeA) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1 C:1", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(VersionVector.parse("B:1 A:1 C:1", nodes: self.allNodes)) + table.version(at: self.nodeC).shouldEqual(VersionVector.parse("C:1 A:1", nodes: self.allNodes)) + + table.prune(self.nodeC) + + table.version(at: self.nodeA).shouldEqual(VersionVector.parse("A:1", nodes: self.allNodes)) + table.version(at: self.nodeB).shouldEqual(VersionVector.parse("B:1 A:1", nodes: self.allNodes)) + table.version(at: self.nodeC).shouldBeNil() + } +} diff --git a/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift b/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift index 09fc9197b..52bb45a4a 100644 --- a/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/LeadershipTests.swift @@ -19,15 +19,15 @@ import NIO import XCTest final class LeadershipTests: XCTestCase { - let firstMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up) - let secondMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .up) - let thirdMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "3.3.3.3", port: 9119), nid: .random()), status: .up) + let memberA = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up) + let memberB = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .up) + let memberC = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "3.3.3.3", port: 9119), nid: .random()), status: .up) let newMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "4.4.4.4", port: 1001), nid: .random()), status: .up) let fakeContext = LeaderElectionContext(log: Logger(label: "mock"), eventLoop: EmbeddedEventLoop()) lazy var initialMembership: Cluster.Membership = [ - firstMember, secondMember, thirdMember, + memberA, memberB, memberC, ] // ==== ------------------------------------------------------------------------------------------------------------ @@ -39,14 +39,14 @@ final class LeadershipTests: XCTestCase { let membership = self.initialMembership let change: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() - change.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + change.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) } func test_LowestAddressReachableMember_notEnoughMembersToDecide() throws { var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership - _ = membership.remove(self.firstMember.node) + _ = membership.removeCompletely(self.memberA.node) // 2 members -> not enough to make decision anymore let change1: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() @@ -56,14 +56,14 @@ final class LeadershipTests: XCTestCase { // 3 members again, should work let change2: Cluster.LeadershipChange? 
= try election.runElection(context: self.fakeContext, membership: membership).future.wait() - change2.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.secondMember)) + change2.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberB)) } func test_LowestAddressReachableMember_notEnoughReachableMembersToDecide() throws { var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership - _ = membership.mark(self.secondMember.node, reachability: .unreachable) + _ = membership.mark(self.memberB.node, reachability: .unreachable) // 2 reachable members -> not enough to make decision anymore let change1: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() @@ -73,15 +73,15 @@ final class LeadershipTests: XCTestCase { // 3 reachable members again, 1 unreachable, should work let change2: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() - change2.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + change2.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) } func test_LowestAddressReachableMember_onlyUnreachableMembers_cantDecide() throws { var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership - _ = membership.mark(self.firstMember.node, reachability: .unreachable) - _ = membership.mark(self.secondMember.node, reachability: .unreachable) + _ = membership.mark(self.memberA.node, reachability: .unreachable) + _ = membership.mark(self.memberB.node, reachability: .unreachable) // 1 reachable member -> not enough to make decision anymore let change1: Cluster.LeadershipChange? = try election.runElection(context: self.fakeContext, membership: membership).future.wait() @@ -92,17 +92,18 @@ final class LeadershipTests: XCTestCase { var election = Leadership.LowestReachableMember(minimumNrOfMembers: 3) var membership = self.initialMembership - _ = try! membership.applyLeadershipChange(to: self.firstMember) // try! because `firstMember` is a member + _ = try! membership.applyLeadershipChange(to: self.memberA) // try! because `memberA` is a member - let leader = membership.leader - leader.shouldEqual(self.firstMember) + var leader = membership.leader + leader.shouldEqual(self.memberA) // leader is down: - _ = membership.mark(self.firstMember.node, as: .down) + _ = membership.mark(self.memberA.node, as: .down) // 2 members -> not enough to make decision anymore // Since we go from a leader to without, there should be a change let change: Cluster.LeadershipChange? 
= try election.runElection(context: self.fakeContext, membership: membership).future.wait() + leader?.status = .down change.shouldEqual(Cluster.LeadershipChange(oldLeader: leader, newLeader: nil)) } @@ -113,11 +114,11 @@ final class LeadershipTests: XCTestCase { _ = membership.join(self.newMember.node) (try election.runElection(context: self.fakeContext, membership: membership).future.wait()) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) - _ = membership.mark(self.firstMember.node, as: .down) + _ = membership.mark(self.memberA.node, as: .down) (try election.runElection(context: self.fakeContext, membership: membership).future.wait()) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.secondMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberB)) } func test_LowestAddressReachableMember_whenCurrentLeaderDown_enoughMembers() throws { @@ -127,11 +128,11 @@ final class LeadershipTests: XCTestCase { _ = membership.join(self.newMember.node) (try election.runElection(context: self.fakeContext, membership: membership).future.wait()) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) - _ = membership.mark(self.firstMember.node, as: .down) + _ = membership.mark(self.memberA.node, as: .down) (try election.runElection(context: self.fakeContext, membership: membership).future.wait()) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.secondMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberB)) } func test_LowestAddressReachableMember_whenCurrentLeaderUnreachable_notEnoughMinMembers() throws { @@ -147,14 +148,14 @@ final class LeadershipTests: XCTestCase { try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) - _ = membership.mark(self.firstMember.node, reachability: .unreachable) + _ = membership.mark(self.memberA.node, reachability: .unreachable) try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) .shouldEqual(nil) - membership.leader.shouldEqual(self.firstMember) + membership.leader.shouldEqual(self.memberA.asUnreachable) } func test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_NOT_loseLeadershipIfBelowMinNrOfMembers() throws { @@ -175,23 +176,23 @@ final class LeadershipTests: XCTestCase { try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) // down third - _ = membership.mark(self.thirdMember.node, as: .down) + _ = membership.mark(self.memberC.node, as: .down) // no reason to remove the leadership from the first node try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) .shouldEqual(nil) // down second - _ = membership.mark(self.secondMember.node, as: .down) + _ = membership.mark(self.memberB.node, as: .down) // STILL no reason to remove the leadership from the first node 
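        // (in this DO_NOT variant the leader is retained: dropping below the minimum member count only blocks electing a new leader)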
try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) .shouldEqual(nil) - membership.leader.shouldEqual(self.firstMember) + membership.leader.shouldEqual(self.memberA) } func test_LowestAddressReachableMember_keepLeader_notEnoughMembers_DO_loseLeadershipIfBelowMinNrOfMembers() throws { @@ -211,17 +212,17 @@ final class LeadershipTests: XCTestCase { try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) - .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) // down third - _ = membership.mark(self.thirdMember.node, as: .down) + _ = membership.mark(self.memberC.node, as: .down) // no reason to remove the leadership from the first node try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) - .shouldEqual(Cluster.LeadershipChange(oldLeader: self.firstMember, newLeader: nil)) + .shouldEqual(Cluster.LeadershipChange(oldLeader: self.memberA, newLeader: nil)) // down second - _ = membership.mark(self.secondMember.node, as: .down) + _ = membership.mark(self.memberB.node, as: .down) // STILL no reason to remove the leadership from the first node try election.runElection(context: self.fakeContext, membership: membership).future.wait() .map(applyToMembership) diff --git a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/MembershipGossipClusteredTests+XCTest.swift similarity index 87% rename from Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests+XCTest.swift rename to Tests/DistributedActorsTests/Cluster/MembershipGossipClusteredTests+XCTest.swift index 396f7bf66..04dd37f8a 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/MembershipGossipClusteredTests+XCTest.swift @@ -20,8 +20,8 @@ import XCTest /// Do NOT edit this file directly as it will be regenerated automatically when needed. 
/// -extension ClusterMembershipGossipTests { - static var allTests: [(String, (ClusterMembershipGossipTests) -> () throws -> Void)] { +extension MembershipGossipClusteredTests { + static var allTests: [(String, (MembershipGossipClusteredTests) -> () throws -> Void)] { return [ ("test_down_beGossipedToOtherNodes", test_down_beGossipedToOtherNodes), ("test_join_swimDiscovered_thirdNode", test_join_swimDiscovered_thirdNode), diff --git a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift b/Tests/DistributedActorsTests/Cluster/MembershipGossipClusteredTests.swift similarity index 91% rename from Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift rename to Tests/DistributedActorsTests/Cluster/MembershipGossipClusteredTests.swift index acf06e4a7..ccf85701d 100644 --- a/Tests/DistributedActorsTests/Cluster/ClusterMembershipGossipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/MembershipGossipClusteredTests.swift @@ -18,11 +18,20 @@ import Foundation import NIOSSL import XCTest -final class ClusterMembershipGossipTests: ClusteredNodesTestBase { +final class MembershipGossipClusteredTests: ClusteredNodesTestBase { override func configureLogCapture(settings: inout LogCapture.Settings) { - settings.filterActorPaths = ["/system/cluster"] - settings.excludeActorPaths = ["/system/cluster/swim"] // we assume it works fine - settings.excludeGrep = ["with generation"] // exclude timers noise + settings.filterActorPaths = [ + "/system/cluster", + ] + settings.excludeActorPaths = [ + "/system/cluster/swim", // we assume it works fine + "/system/receptionist", + ] + settings.excludeGrep = [ + "TimerKey", + "schedule next gossip", + "Gossip payload updated", + ] } // ==== ------------------------------------------------------------------------------------------------------------ diff --git a/Tests/DistributedActorsTests/Cluster/MembershipGossipSeenTableTests.swift b/Tests/DistributedActorsTests/Cluster/MembershipGossipSeenTableTests.swift deleted file mode 100644 index 7df0c75d8..000000000 --- a/Tests/DistributedActorsTests/Cluster/MembershipGossipSeenTableTests.swift +++ /dev/null @@ -1,209 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift Distributed Actors open source project -// -// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors -// Licensed under Apache License v2.0 -// -// See LICENSE.txt for license information -// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors -// -// SPDX-License-Identifier: Apache-2.0 -// -//===----------------------------------------------------------------------===// - -@testable import DistributedActors -import DistributedActorsTestKit -import NIO -import XCTest - -/// Tests of just the datatype -final class MembershipGossipSeenTableTests: XCTestCase { - typealias SeenTable = Cluster.Gossip.SeenTable - - var myselfNode: UniqueNode! - var secondNode: UniqueNode! - var thirdNode: UniqueNode! 
- - override func setUp() { - super.setUp() - self.myselfNode = UniqueNode(protocol: "sact", systemName: "myself", host: "127.0.0.1", port: 7111, nid: .random()) - self.secondNode = UniqueNode(protocol: "sact", systemName: "second", host: "127.0.0.1", port: 7222, nid: .random()) - self.thirdNode = UniqueNode(protocol: "sact", systemName: "third", host: "127.0.0.1", port: 7333, nid: .random()) - } - - func test_seenTable_compare_concurrent() { - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M:1 - - var incomingVersion = VersionVector.first(at: .uniqueNode(self.secondNode)) - incomingVersion.increment(at: .uniqueNode(self.secondNode)) - incomingVersion.increment(at: .uniqueNode(self.secondNode)) // S:2 - - // neither node knew about each other, so the updates were concurrent; - // myself receives the table and compares with the incoming: - let result = table.compareVersion(observedOn: self.myselfNode, to: incomingVersion) - result.shouldEqual(VersionVector.CausalRelation.concurrent) - } - - // ==== ------------------------------------------------------------------------------------------------------------ - // MARK: increments - - func test_incrementVersion() { - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("")) - - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:1")) - - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:2")) - - table.incrementVersion(owner: self.myselfNode, at: self.secondNode) - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:2 S:1")) - table.version(at: self.secondNode).shouldBeNil() - - table.incrementVersion(owner: self.secondNode, at: self.secondNode) - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:2 S:1")) - table.version(at: self.secondNode).shouldEqual(self.parseVersionVector("S:1")) - } - - // ==== ------------------------------------------------------------------------------------------------------------ - // MARK: merge - - func test_seenTable_merge_notYetSeenInformation() { - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M observed: M:1 - - var incoming = Cluster.Gossip(ownerNode: self.secondNode) - incoming.incrementOwnerVersion() // S observed: S:1 - incoming.incrementOwnerVersion() // S observed: S:2 - - table.merge(owner: self.myselfNode, incoming: incoming) - - table.version(at: self.myselfNode).shouldEqual(VersionVector([ - (.uniqueNode(self.myselfNode), 1), (.uniqueNode(self.secondNode), 2), - ])) - table.version(at: self.secondNode).shouldEqual(VersionVector([ - (.uniqueNode(self.secondNode), 2), // as we do not know if the second node has observed OUR information yet - ])) - } - - func test_seenTable_merge_sameInformation() { - // a situation in which the two nodes have converged, so their versions are .same - - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M observed: M:1 - table.incrementVersion(owner: self.myselfNode, at: self.secondNode) // M observed: M:1 S:1 - 
table.incrementVersion(owner: self.myselfNode, at: self.secondNode) // M observed: M:1 S:2 - - var incoming = Cluster.Gossip(ownerNode: self.secondNode) // S observed: - incoming.incrementOwnerVersion() // S observed: S:1 - incoming.incrementOwnerVersion() // S observed: S:2 - incoming.seen.incrementVersion(owner: self.secondNode, at: self.myselfNode) // S observed: M:1 S:2 - - table.merge(owner: self.myselfNode, incoming: incoming) - - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:1 S:2")) - table.version(at: self.secondNode).shouldEqual(self.parseVersionVector("M:1 S:2")) - } - - func test_seenTable_merge_aheadInformation() { - // the incoming gossip is "ahead" and has some more information - - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M observed: M:1 - - var incoming = Cluster.Gossip(ownerNode: self.secondNode) // S observed: - incoming.incrementOwnerVersion() // S observed: S:1 - incoming.incrementOwnerVersion() // S observed: S:2 - incoming.seen.incrementVersion(owner: self.secondNode, at: self.myselfNode) // S observed: M:1 S:2 - - table.merge(owner: self.myselfNode, incoming: incoming) - - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:1 S:2")) - table.version(at: self.secondNode).shouldEqual(self.parseVersionVector("M:1 S:2")) - } - - func test_seenTable_merge_behindInformation() { - // the incoming gossip is "behind" - - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M observed: M:1 - table.incrementVersion(owner: self.myselfNode, at: self.secondNode) // M observed: M:1 S:1 - table.incrementVersion(owner: self.myselfNode, at: self.secondNode) // M observed: M:1 S:2 - - var incoming = Cluster.Gossip(ownerNode: self.secondNode) // S observed: - incoming.incrementOwnerVersion() // S observed: S:1 - incoming.incrementOwnerVersion() // S observed: S:2 - - table.merge(owner: self.myselfNode, incoming: incoming) - - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:1 S:2")) - table.version(at: self.secondNode).shouldEqual(self.parseVersionVector("S:2")) - } - - func test_seenTable_merge_concurrentInformation() { - // the incoming gossip is "concurrent" - - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M observed: M:1 - table.incrementVersion(owner: self.myselfNode, at: self.secondNode) // M observed: M:1 S:1 - table.incrementVersion(owner: self.myselfNode, at: self.secondNode) // M observed: M:1 S:2 - table.incrementVersion(owner: self.myselfNode, at: self.thirdNode) // M observed: M:1 S:2 T:1 - // M has seen gossip from S, when it was at t=2 - table.incrementVersion(owner: self.secondNode, at: self.secondNode) // S observed: S:2 - table.incrementVersion(owner: self.secondNode, at: self.secondNode) // S observed: S:3 - - // in reality S is quite more far ahead, already at t=4 - var incoming = Cluster.Gossip(ownerNode: self.secondNode) // S observed - incoming.incrementOwnerVersion() // S observed: S:1 - incoming.incrementOwnerVersion() // S observed: S:2 - incoming.incrementOwnerVersion() // S observed: S:3 - incoming.seen.incrementVersion(owner: self.secondNode, at: self.thirdNode) // S observed: S:3 T:1 - - table.merge(owner: self.myselfNode, incoming: incoming) - - 
table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:1 S:3 T:1")) - table.version(at: self.secondNode).shouldEqual(incoming.seen.version(at: self.secondNode)) - } - - func test_seenTable_merge_concurrentInformation_unknownMember() { - // the incoming gossip is "concurrent", and has a table entry for a node we don't know - - var table = Cluster.Gossip.SeenTable(myselfNode: self.myselfNode, version: .init()) - table.incrementVersion(owner: self.myselfNode, at: self.myselfNode) // M observed: M:1 - - var incoming = Cluster.Gossip(ownerNode: self.secondNode) // S observed - incoming.incrementOwnerVersion() // S observed: S:1 - incoming.incrementOwnerVersion() // S observed: S:2 - incoming.seen.incrementVersion(owner: self.secondNode, at: self.thirdNode) // S observed: S:2 T:1 - // also it knows third is at t1 - incoming.seen.incrementVersion(owner: self.thirdNode, at: self.thirdNode) // T observed: T:1 - - table.merge(owner: self.myselfNode, incoming: incoming) - - table.version(at: self.myselfNode).shouldEqual(self.parseVersionVector("M:1 S:2 T:1")) - table.version(at: self.secondNode).shouldEqual(self.parseVersionVector("S:2 T:1")) - table.version(at: self.thirdNode).shouldEqual(self.parseVersionVector("T:1")) - } - - // ==== ---------------------------------------------------------------------------------------------------------------- - // MARK: Utilities - - func parseVersionVector(_ s: String) -> VersionVector { - func parseReplicaId(r: Substring) -> ReplicaId { - switch r { - case "M": return .uniqueNode(self.myselfNode) - case "S": return .uniqueNode(self.secondNode) - case "T": return .uniqueNode(self.thirdNode) - default: fatalError("Unknown replica id: \(r)") - } - } - let replicaVersions: [VersionVector.ReplicaVersion] = s.split(separator: " ").map { segment in - let v = segment.split(separator: ":") - return (parseReplicaId(r: v.first!), Int(v.dropFirst().first!)!) 
- } - return VersionVector(replicaVersions) - } -} diff --git a/Tests/DistributedActorsTests/Cluster/MembershipGossipTests+XCTest.swift b/Tests/DistributedActorsTests/Cluster/MembershipGossipTests+XCTest.swift index c4eb475fa..ddde5f125 100644 --- a/Tests/DistributedActorsTests/Cluster/MembershipGossipTests+XCTest.swift +++ b/Tests/DistributedActorsTests/Cluster/MembershipGossipTests+XCTest.swift @@ -24,8 +24,19 @@ extension MembershipGossipTests { static var allTests: [(String, (MembershipGossipTests) -> () throws -> Void)] { return [ ("test_mergeForward_incomingGossip_firstGossipFromOtherNode", test_mergeForward_incomingGossip_firstGossipFromOtherNode), + ("test_mergeForward_incomingGossip_firstGossipFromOtherNodes", test_mergeForward_incomingGossip_firstGossipFromOtherNodes), ("test_mergeForward_incomingGossip_sameVersions", test_mergeForward_incomingGossip_sameVersions), - ("test_mergeForward_incomingGossip_hasNoInformation", test_mergeForward_incomingGossip_hasNoInformation), + ("test_mergeForward_incomingGossip_fromFourth_onlyKnowsAboutItself", test_mergeForward_incomingGossip_fromFourth_onlyKnowsAboutItself), + ("test_mergeForward_incomingGossip_localHasRemoved_incomingHasOldViewWithDownNode", test_mergeForward_incomingGossip_localHasRemoved_incomingHasOldViewWithDownNode), + ("test_mergeForward_incomingGossip_concurrent_leaderDisagreement", test_mergeForward_incomingGossip_concurrent_leaderDisagreement), + ("test_mergeForward_incomingGossip_concurrent_simple", test_mergeForward_incomingGossip_concurrent_simple), + ("test_mergeForward_incomingGossip_hasNewNode", test_mergeForward_incomingGossip_hasNewNode), + ("test_mergeForward_removal_incomingGossip_isAhead_hasRemovedNodeKnownToBeDown", test_mergeForward_removal_incomingGossip_isAhead_hasRemovedNodeKnownToBeDown), + ("test_mergeForward_incomingGossip_removal_isAhead_hasMyNodeRemoved_thusWeKeepItAsRemoved", test_mergeForward_incomingGossip_removal_isAhead_hasMyNodeRemoved_thusWeKeepItAsRemoved), + ("test_converged_shouldBeTrue_forNoMembers", test_converged_shouldBeTrue_forNoMembers), + ("test_converged_amongUpMembers", test_converged_amongUpMembers), + ("test_converged_othersAreOnlyDown", test_converged_othersAreOnlyDown), + ("test_gossip_eventuallyConverges", test_gossip_eventuallyConverges), ] } } diff --git a/Tests/DistributedActorsTests/Cluster/MembershipGossipTests.swift b/Tests/DistributedActorsTests/Cluster/MembershipGossipTests.swift index 654b62d6e..fcfb1b8ed 100644 --- a/Tests/DistributedActorsTests/Cluster/MembershipGossipTests.swift +++ b/Tests/DistributedActorsTests/Cluster/MembershipGossipTests.swift @@ -19,62 +19,557 @@ import XCTest /// Tests of just the datatype final class MembershipGossipTests: XCTestCase { - var myselfNode: UniqueNode! - var secondNode: UniqueNode! - var thirdNode: UniqueNode! + var nodeA: UniqueNode! + var nodeB: UniqueNode! + var nodeC: UniqueNode! var fourthNode: UniqueNode! - - var myGossip: Cluster.Gossip! 
+ lazy var allNodes = [ + self.nodeA!, self.nodeB!, self.nodeC!, self.fourthNode!, + ] override func setUp() { super.setUp() - self.myselfNode = UniqueNode(protocol: "sact", systemName: "myself", host: "127.0.0.1", port: 7111, nid: .random()) - self.secondNode = UniqueNode(protocol: "sact", systemName: "second", host: "127.0.0.1", port: 7222, nid: .random()) - self.thirdNode = UniqueNode(protocol: "sact", systemName: "third", host: "127.0.0.1", port: 7333, nid: .random()) - self.fourthNode = UniqueNode(protocol: "sact", systemName: "third", host: "127.0.0.1", port: 7444, nid: .random()) - self.myGossip = Cluster.Gossip(ownerNode: self.myselfNode) + self.nodeA = UniqueNode(protocol: "sact", systemName: "firstA", host: "127.0.0.1", port: 7111, nid: .random()) + self.nodeB = UniqueNode(protocol: "sact", systemName: "secondB", host: "127.0.0.1", port: 7222, nid: .random()) + self.nodeC = UniqueNode(protocol: "sact", systemName: "thirdC", host: "127.0.0.1", port: 7333, nid: .random()) + self.fourthNode = UniqueNode(protocol: "sact", systemName: "fourthD", host: "127.0.0.1", port: 7444, nid: .random()) } + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: Merging gossips + func test_mergeForward_incomingGossip_firstGossipFromOtherNode() { - var gossipFromSecond = Cluster.Gossip(ownerNode: self.secondNode) - _ = gossipFromSecond.membership.join(self.secondNode) + var gossip = Cluster.Gossip.parse( + """ + A.joining + A: A:1 + """, owner: self.nodeA, nodes: self.allNodes + ) + + let incoming = Cluster.Gossip.parse( + """ + B.joining + B: B:1 + """, owner: self.nodeB, nodes: self.allNodes + ) - let directive = self.myGossip.mergeForward(incoming: gossipFromSecond) + let directive = gossip.mergeForward(incoming: incoming) directive.effectiveChanges.shouldEqual( - [Cluster.MembershipChange(member: Cluster.Member(node: self.secondNode, status: .joining), toStatus: .joining)] + [Cluster.MembershipChange(node: self.nodeB, fromStatus: nil, toStatus: .joining)] + ) + + gossip.shouldEqual(Cluster.Gossip.parse( + """ + A.joining B.joining + A: A:1 B:1 + B: B:1 + """, owner: self.nodeA, nodes: self.allNodes + )) + } + + func test_mergeForward_incomingGossip_firstGossipFromOtherNodes() { + var gossip = Cluster.Gossip.parse( + """ + A.joining + A: A:1 + """, owner: self.nodeA, nodes: self.allNodes + ) + + let incoming = Cluster.Gossip.parse( + """ + B.joining C.joining + B: B:1 C:1 + C: B:1 C:1 + """, owner: self.nodeB, nodes: self.allNodes + ) + + let directive = gossip.mergeForward(incoming: incoming) + + Set(directive.effectiveChanges).shouldEqual(Set([ + Cluster.MembershipChange(node: self.nodeB, fromStatus: nil, toStatus: .joining), + Cluster.MembershipChange(node: self.nodeC, fromStatus: nil, toStatus: .joining), + ])) + + let expected = Cluster.Gossip.parse( + """ + A.joining B.joining C.joining + A: A:1 B:1 C:1 + B: B:1 C:1 + C: B:1 C:1 + """, owner: self.nodeA, nodes: self.allNodes ) + + gossip.seen.shouldEqual(expected.seen) + gossip.seen.version(at: self.nodeA).shouldEqual(expected.seen.version(at: self.nodeA)) + gossip.seen.version(at: self.nodeB).shouldEqual(expected.seen.version(at: self.nodeB)) + gossip.seen.version(at: self.nodeC).shouldEqual(expected.seen.version(at: self.nodeC)) + gossip.membership.shouldEqual(expected.membership) + gossip.shouldEqual(expected) } func test_mergeForward_incomingGossip_sameVersions() { - self.myGossip.seen.incrementVersion(owner: self.secondNode, at: self.myselfNode) // v: myself:1, 
second:1
-        _ = self.myGossip.membership.join(self.secondNode) // myself:joining, second:joining
+        var gossip = Cluster.Gossip(ownerNode: self.nodeA)
+        _ = gossip.membership.join(self.nodeA)
+        gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeA) // v: myself:1, second:1
+        _ = gossip.membership.join(self.nodeB) // myself:joining, second:joining
-        let gossipFromSecond = Cluster.Gossip(ownerNode: self.secondNode)
-        let directive = self.myGossip.mergeForward(incoming: gossipFromSecond)
+        let gossipFromSecond = Cluster.Gossip(ownerNode: self.nodeB)
+        let directive = gossip.mergeForward(incoming: gossipFromSecond)
         directive.effectiveChanges.shouldEqual([])
     }
-    func test_mergeForward_incomingGossip_hasNoInformation() {
-        _ = self.myGossip.membership.join(self.myselfNode)
-        self.myGossip.incrementOwnerVersion()
-        _ = self.myGossip.membership.join(self.secondNode)
-        self.myGossip.seen.incrementVersion(owner: self.secondNode, at: self.secondNode)
-        _ = self.myGossip.membership.join(self.thirdNode)
-        self.myGossip.seen.incrementVersion(owner: self.thirdNode, at: self.thirdNode)
+    func test_mergeForward_incomingGossip_fromFourth_onlyKnowsAboutItself() {
+        var gossip = Cluster.Gossip.parse(
+            """
+            A.joining B.joining C.joining
+            A: A@1 B@1 C@1
+            """, owner: self.nodeA, nodes: self.allNodes
+        )
         // only knows about fourth, while myGossip has first, second and third
-        var incomingGossip = Cluster.Gossip(ownerNode: self.fourthNode)
-        _ = incomingGossip.membership.join(self.fourthNode)
-        incomingGossip.incrementOwnerVersion()
+        let incomingGossip = Cluster.Gossip.parse(
+            """
+            D.joining
+            D: D@1
+            """, owner: self.fourthNode, nodes: self.allNodes
+        )
-        let directive = self.myGossip.mergeForward(incoming: incomingGossip)
+        let directive = gossip.mergeForward(incoming: incomingGossip)
         // this test also covers that the merge does not accidentally cause changes into .removed, which would be catastrophic
         directive.causalRelation.shouldEqual(.concurrent)
         directive.effectiveChanges.shouldEqual(
-            [Cluster.MembershipChange(member: Cluster.Member(node: self.fourthNode, status: .joining), toStatus: .joining)]
+            [Cluster.MembershipChange(node: self.fourthNode, fromStatus: nil, toStatus: .joining)]
+        )
+        gossip.shouldEqual(Cluster.Gossip.parse(
+            """
+            A.joining B.joining C.joining
+            A: A@1 B@1 C@1 D@1
+            D: D@1
+            """, owner: self.nodeA, nodes: self.allNodes
+        ))
+    }
+
+    func test_mergeForward_incomingGossip_localHasRemoved_incomingHasOldViewWithDownNode() {
+        var gossip = Cluster.Gossip.parse(
+            """
+            A.up B.down C.up
+            A: A@5 B@5 C@6
+            B: A@5 B@5 C@6
+            C: A@5 B@5 C@6
+            """,
+            owner: self.nodeA, nodes: self.allNodes
+        )
+
+        // while we just removed it:
+        gossip.converged().shouldBeTrue() // sanity check
+        let removedMember: Cluster.Member = gossip.membership.uniqueMember(self.nodeB)!
+        _ = gossip.pruneMember(removedMember)
+        _ = gossip.incrementOwnerVersion()
+
+        let incomingOldGossip = Cluster.Gossip.parse(
+            """
+            A.up B.down C.up
+            A: A@5 B@5 C@6
+            B: A@5 B@10 C@6
+            C: A@5 B@5 C@6
+            """,
+            owner: self.nodeB, nodes: self.allNodes
+        ) // TODO: this will be rejected since owner is the down node (!); add another test with third sending the same info
+
+        let gossipBeforeMerge = gossip
+        let directive = gossip.mergeForward(incoming: incomingOldGossip)
+
+        directive.causalRelation.shouldEqual(.concurrent)
+        directive.effectiveChanges.shouldEqual(
+            [] // no effect
+        )
+
+        gossip.membership.members(atLeast: .joining).shouldNotContain(removedMember)
+        gossip.seen.nodes.shouldNotContain(removedMember.node)
+
+        gossip.seen.shouldEqual(gossipBeforeMerge.seen)
+        gossip.membership.shouldEqual(gossipBeforeMerge.membership)
+    }
+
+    func test_mergeForward_incomingGossip_concurrent_leaderDisagreement() {
+        var gossip = Cluster.Gossip.parse(
+            """
+            A.up B.joining [leader:A]
+            A: A@5 B@5
+            B: A@5 B@5
+            """,
+            owner: self.nodeA, nodes: self.allNodes
+        )
+
+        // this may happen after healing a cluster partition,
+        // once the nodes talk to each other again, they will run leader election and resolve the double leader situation
+        // until that happens, the two leaders indeed remain as-is -- as the membership itself is not the right place to resolve
+        // who shall be leader
+        let incomingGossip = Cluster.Gossip.parse(
+            """
+            A.up B.joining C.up [leader:B]
+            B: B@2 C@1
+            C: B@5 C@9
+            """,
+            owner: self.nodeC, nodes: self.allNodes
+        )
+
+        let directive = gossip.mergeForward(incoming: incomingGossip)
+
+        directive.causalRelation.shouldEqual(.concurrent)
+        directive.effectiveChanges.shouldEqual(
+            [
+                // such an unknown -> up change may indeed happen if we have a large cluster which ends up partitioned into two:
+                // each side keeps adding new nodes, and then the partition heals. The core membership does NOT prevent this;
+                // what WOULD prevent this is how the leaders are selected -- e.g. dynamic quorum etc. Though even quorums can be cheated
+                // in dramatic situations (e.g. we add more nodes on one side than the quorum, so it gets "its illegal quorum"),
+                // such is the case with any "dynamic" quorum however. We CAN and will provide strategies to select leaders which
+                // strongly mitigate such risk though.
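+                // (hence the single effective change asserted below: C was previously unknown locally and arrives already marked .up)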
+ Cluster.MembershipChange(node: self.nodeC, fromStatus: nil, toStatus: .up), + // note that this has NO effect on the leader; we keep trusting "our" leader, + // leader election should kick in and reconcile those two + ] + ) + + let expected = Cluster.Gossip.parse( + """ + A.up B.joining C.up [leader:A] + A: A:5 B:5 C:9 + B: A:5 B:5 C@1 + C: B:5 C:9 + """, owner: self.nodeA, nodes: self.allNodes + ) + + gossip.seen.shouldEqual(expected.seen) + gossip.membership.shouldEqual(expected.membership) + gossip.shouldEqual(expected) + } + + func test_mergeForward_incomingGossip_concurrent_simple() { + var gossip = Cluster.Gossip.parse( + """ + A.up B.joining + A: A@4 + """, owner: self.nodeA, nodes: self.allNodes + ) + + let concurrent = Cluster.Gossip.parse( + """ + A.joining B.joining + B: B@2 + """, owner: self.nodeB, nodes: self.allNodes + ) + + pprint("membership = \(gossip)") + pprint("ahead = \(concurrent)") + + let directive = gossip.mergeForward(incoming: concurrent) + pprint("membership.version = \(gossip.version)") + + gossip.owner.shouldEqual(self.nodeA) + directive.effectiveChanges.count.shouldEqual(0) + gossip.shouldEqual(Cluster.Gossip.parse( + """ + A.up B.joining + A: A@4 B@2 + B: B@2 + """, owner: self.nodeA, nodes: self.allNodes + )) + } + + func test_mergeForward_incomingGossip_hasNewNode() { + var gossip = Cluster.Gossip.parse( + """ + A.up + A: A@5 + """, + owner: self.nodeA, nodes: self.allNodes + ) + + var incomingGossip = gossip + _ = incomingGossip.membership.join(self.nodeB) + incomingGossip.incrementOwnerVersion() + + let directive = gossip.mergeForward(incoming: incomingGossip) + + directive.causalRelation.shouldEqual(.happenedBefore) + directive.effectiveChanges.shouldEqual( + [Cluster.MembershipChange(node: self.nodeB, fromStatus: nil, toStatus: .joining)] + ) + + gossip.membership.members(atLeast: .joining).shouldContain(Cluster.Member(node: self.nodeB, status: .joining)) + } + + func test_mergeForward_removal_incomingGossip_isAhead_hasRemovedNodeKnownToBeDown() { + var gossip = Cluster.Gossip.parse( + """ + A.up B.down C.up [leader:A] + A: A@5 B@5 C@6 + B: A@5 B@5 C@6 + C: A@5 B@5 C@6 + """, + owner: self.nodeA, nodes: self.allNodes + ) + + let incomingGossip = Cluster.Gossip.parse( + """ + A.up C.up + A: A@5 C@6 + C: A@5 C@7 + """, owner: self.nodeC, nodes: self.allNodes + ) + + let directive = gossip.mergeForward(incoming: incomingGossip) + + directive.causalRelation.shouldEqual(.concurrent) + // v-clock wise this indeed is concurrent, however due to down/removal involved, we can handle it + + directive.effectiveChanges.shouldEqual([ + Cluster.MembershipChange(node: self.nodeB, fromStatus: .down, toStatus: .removed), + ]) + + gossip.shouldEqual(Cluster.Gossip.parse( + """ + A.up C.up [leader:A] + A: A@5 C@7 + C: A@5 C@7 + """, owner: self.nodeA, nodes: self.allNodes + )) + } + + func test_mergeForward_incomingGossip_removal_isAhead_hasMyNodeRemoved_thusWeKeepItAsRemoved() { + var gossip = Cluster.Gossip.parse( + """ + A.up B.down C.up + A: A@5 B@5 C@6 + B: A@5 B@5 C@6 + C: A@5 B@5 C@6 + """, + owner: self.nodeB, nodes: self.allNodes + ) + + let incomingGossip = Cluster.Gossip.parse( + """ + A.up C.up + A: A@5 C@6 + C: A@5 C@7 + """, owner: self.nodeC, nodes: self.allNodes + ) + + let directive = gossip.mergeForward(incoming: incomingGossip) + + directive.causalRelation.shouldEqual(.concurrent) + directive.effectiveChanges.shouldEqual([ + Cluster.MembershipChange(node: self.nodeB, fromStatus: .down, toStatus: .removed), + ]) + + // we MIGHT receive a removal of 
"our node" however we must never apply such change! + // we know we are `.down` and that's the most "we" will ever perceive ourselves as -- i.e. removed is only for "others". + let expected = Cluster.Gossip.parse( + """ + A.up B.removed C.up + A: A@5 B@5 C@6 + B: A@5 B@5 C@7 + C: A@5 B@5 C@7 + """, owner: self.nodeB, nodes: self.allNodes ) + gossip.seen.version(at: self.nodeA).shouldEqual(expected.seen.version(at: self.nodeA)) + gossip.seen.version(at: self.nodeB).shouldEqual(expected.seen.version(at: self.nodeB)) + gossip.seen.version(at: self.nodeC).shouldEqual(expected.seen.version(at: self.nodeC)) + gossip.shouldEqual(expected) + } + + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: Convergence + + func test_converged_shouldBeTrue_forNoMembers() { + var gossip = Cluster.Gossip(ownerNode: self.nodeA) + _ = gossip.membership.join(self.nodeA) + gossip.converged().shouldBeTrue() + + gossip.incrementOwnerVersion() + gossip.converged().shouldBeTrue() + } + + func test_converged_amongUpMembers() { + var gossip = Cluster.Gossip(ownerNode: self.nodeA) + _ = gossip.membership.join(self.nodeA) + _ = gossip.membership.mark(self.nodeA, as: .up) + + _ = gossip.membership.join(self.nodeB) + _ = gossip.membership.mark(self.nodeB, as: .up) + + _ = gossip.membership.join(self.nodeC) + _ = gossip.membership.mark(self.nodeC, as: .up) + + gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeA) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeB) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeC) + // we are "ahead" of others + gossip.converged().shouldBeFalse() + + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeA) + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeB) + // others still catching up + gossip.converged().shouldBeFalse() + + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeC) + // second has caught up, but third still not + gossip.converged().shouldBeFalse() + + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeA) + // second has caught up, but third catching up + gossip.converged().shouldBeFalse() + + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeB) + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeC) + // second and third have caught up + gossip.converged().shouldBeTrue() + + // if second and third keep moving on, they still have at-least seen our version, + // co convergence still should remain true + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeC) + + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeC) + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeC) + gossip.converged().shouldBeTrue() + } + + func test_converged_othersAreOnlyDown() { + let gossip = Cluster.Gossip.parse( + """ + A.up B.down + A: A@8 B@5 + B: B@6 + C: A@7 B@5 + """, owner: self.nodeA, nodes: self.allNodes + ) + + // since all other nodes other than A are down or joining, thus we do not count them in convergence + gossip.converged().shouldBeTrue() + } + + // FIXME: we should not need .joining nodes to participate on convergence() + func fixme_converged_joiningOrDownMembersDoNotCount() { + var gossip = Cluster.Gossip(ownerNode: self.nodeA) + _ = gossip.membership.join(self.nodeA) + + _ = gossip.membership.join(self.nodeB) + _ = gossip.membership.mark(self.nodeB, as: .joining) + + _ = gossip.membership.join(self.nodeC) + _ = gossip.membership.mark(self.nodeC, as: .down) + + 
gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeA) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeB) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeC) + + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeA) + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeB) + gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeC) + + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeA) + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeB) + gossip.seen.incrementVersion(owner: self.nodeC, at: self.nodeC) + + // all have caught up, however they are only .down or .joining (!) + gossip.converged().shouldBeTrue() + // (convergence among only joining members matters since then we can kick off the leader actions to move members up) + + // joining a node that is up, and caught up though means we can converge + _ = gossip.membership.join(self.fourthNode) + _ = gossip.membership.mark(self.fourthNode, as: .up) + + // as the new node is up, it matters to convergence, it should have the full picture but does not + // as such, we are not converged + gossip.converged().shouldBeFalse() + + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.nodeA) + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.nodeB) + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.nodeC) + // it moved a bit fast: + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.fourthNode) + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.fourthNode) + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.fourthNode) + gossip.seen.incrementVersion(owner: self.fourthNode, at: self.fourthNode) + // but we're up to date with this: + gossip.seen.incrementVersion(owner: self.nodeA, at: self.fourthNode) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.fourthNode) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.fourthNode) + gossip.seen.incrementVersion(owner: self.nodeA, at: self.fourthNode) + + // the new node has caught up: + gossip.converged().shouldBeTrue() + } + + func test_gossip_eventuallyConverges() { + func makeRandomGossip(owner node: UniqueNode) -> Cluster.Gossip { + var gossip = Cluster.Gossip(ownerNode: node) + _ = gossip.membership.join(node) + _ = gossip.membership.mark(node, as: .joining) + var vv = VersionVector() + vv.state[.uniqueNode(node)] = node.port + gossip.seen.underlying[node] = .init(vv) + + // know just enough that we're not alone and thus need to communicate: + _ = gossip.membership.join(self.nodeA) + _ = gossip.membership.mark(self.nodeA, as: .up) + _ = gossip.seen.incrementVersion(owner: self.nodeA, at: self.nodeA) + _ = gossip.seen.incrementVersion(owner: node, at: self.nodeA) + + _ = gossip.membership.join(self.nodeB) + _ = gossip.membership.mark(self.nodeB, as: .up) + _ = gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeB) + _ = gossip.seen.incrementVersion(owner: self.nodeB, at: self.nodeB) + _ = gossip.seen.incrementVersion(owner: node, at: self.nodeB) + _ = gossip.seen.incrementVersion(owner: node, at: self.nodeB) + return gossip + } + + let firstGossip = makeRandomGossip(owner: self.nodeA) + let secondGossip = makeRandomGossip(owner: self.nodeB) + let thirdGossip = makeRandomGossip(owner: self.nodeC) + let fourthGossip = makeRandomGossip(owner: self.fourthNode) + + var gossips = [ + 1: firstGossip, + 2: secondGossip, + 3: thirdGossip, + 4: fourthGossip, + ] + + gossips.forEach { _, gossip in + assert(!gossip.converged(), 
"Should not start out convergent") + } + + var gossipSend = 0 + let gossipSendsMax = 100 + while gossipSend < gossipSendsMax { + let (_, from) = gossips.randomElement()! + var (toId, target) = gossips.randomElement()! + + _ = target.mergeForward(incoming: from) + gossips[toId] = target + + if gossips.allSatisfy({ $1.converged() }) { + break + } + + gossipSend += 1 + } + + let allConverged = gossips.allSatisfy { $1.converged() } + guard allConverged else { + XCTFail(""" + Gossips among \(gossips.count) members did NOT converge after \(gossipSend) (individual) sends. + \(gossips.values.map { "\($0)" }.joined(separator: "\n")) + """) + return + } + + pinfo("Gossip converged on all \(gossips.count) members, after \(gossipSend) (individual) sends") } } diff --git a/Tests/DistributedActorsTests/Cluster/RemotingHandshakeStateMachineTests.swift b/Tests/DistributedActorsTests/Cluster/RemotingHandshakeStateMachineTests.swift index 7881b1dc3..a1a912a5e 100644 --- a/Tests/DistributedActorsTests/Cluster/RemotingHandshakeStateMachineTests.swift +++ b/Tests/DistributedActorsTests/Cluster/RemotingHandshakeStateMachineTests.swift @@ -24,40 +24,14 @@ final class RemoteHandshakeStateMachineTests: XCTestCase { let systemName = "RemoteHandshakeStateMachineTests" - // usual reminder that Swift Distributed Actors is not inherently "client/server" once associated, only the handshake is - enum HandshakeSide: String { - case client - case server - } - - func makeMockKernelState(side: HandshakeSide, configureSettings: (inout ClusterSettings) -> Void = { _ in () }) -> ClusterShellState { - var settings = ClusterSettings( - node: Node( - systemName: systemName, - host: "127.0.0.1", - port: 7337 - ) - ) - configureSettings(&settings) - let log = Logger(label: "handshake-\(side)") // TODO: could be a mock logger we can assert on? 
- - return ClusterShellState( - settings: settings, - channel: EmbeddedChannel(), - events: EventStream(ref: ActorRef(.deadLetters(.init(log, address: ._deadLetters, system: nil)))), - gossipControl: ConvergentGossipControl(ActorRef(.deadLetters(.init(log, address: ._deadLetters, system: nil)))), - log: log - ) - } - // ==== ------------------------------------------------------------------------------------------------------------ // MARK: Happy path handshakes func test_handshake_happyPath() throws { - let serverKernel = self.makeMockKernelState(side: .server) + let serverKernel = ClusterShellState.makeTestMock(side: .server) let serverAddress = serverKernel.myselfNode - let clientKernel = self.makeMockKernelState(side: .client) { settings in + let clientKernel = ClusterShellState.makeTestMock(side: .client) { settings in settings.node.port = 2222 } @@ -93,10 +67,10 @@ final class RemoteHandshakeStateMachineTests: XCTestCase { // MARK: Version negotiation func test_negotiate_server_shouldAcceptClient_newerPatch() throws { - let serverKernel = self.makeMockKernelState(side: .server) + let serverKernel = ClusterShellState.makeTestMock(side: .server) let serverAddress = serverKernel.myselfNode - let clientKernel = self.makeMockKernelState(side: .client) { settings in + let clientKernel = ClusterShellState.makeTestMock(side: .client) { settings in settings.node.port = 2222 settings._protocolVersion.patch += 1 } @@ -118,10 +92,10 @@ final class RemoteHandshakeStateMachineTests: XCTestCase { } func test_negotiate_server_shouldRejectClient_newerMajor() throws { - let serverKernel = self.makeMockKernelState(side: .server) + let serverKernel = ClusterShellState.makeTestMock(side: .server) let serverAddress = serverKernel.myselfNode - let clientKernel = self.makeMockKernelState(side: .client) { settings in + let clientKernel = ClusterShellState.makeTestMock(side: .client) { settings in settings.node.port = 2222 settings._protocolVersion.major += 1 } @@ -149,10 +123,10 @@ final class RemoteHandshakeStateMachineTests: XCTestCase { // MARK: Handshake timeout causing retries func test_onTimeout_shouldReturnNewHandshakeOffersMultipleTimes() throws { - let serverKernel = self.makeMockKernelState(side: .server) + let serverKernel = ClusterShellState.makeTestMock(side: .server) let serverAddress = serverKernel.myselfNode - let clientKernel = self.makeMockKernelState(side: .client) { settings in + let clientKernel = ClusterShellState.makeTestMock(side: .client) { settings in settings.node.port = 8228 } diff --git a/Tests/DistributedActorsTests/Cluster/TestExtensions+MembershipDSL.swift b/Tests/DistributedActorsTests/Cluster/TestExtensions+MembershipDSL.swift new file mode 100644 index 000000000..b24bf2174 --- /dev/null +++ b/Tests/DistributedActorsTests/Cluster/TestExtensions+MembershipDSL.swift @@ -0,0 +1,143 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift Distributed Actors open source project +// +// Copyright (c) 2020 Apple Inc. 
and the Swift Distributed Actors project authors
+// Licensed under Apache License v2.0
+//
+// See LICENSE.txt for license information
+// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+//===----------------------------------------------------------------------===//
+
+@testable import DistributedActors
+import Logging
+import NIO
+
+// ==== ----------------------------------------------------------------------------------------------------------------
+// MARK: Membership Testing DSL
+
+extension Cluster.Gossip {
+    /// First line is Membership DSL, followed by lines of the SeenTable DSL
+    internal static func parse(_ dsl: String, owner: UniqueNode, nodes: [UniqueNode]) -> Cluster.Gossip {
+        let dslLines = dsl.split(separator: "\n")
+        var gossip = Cluster.Gossip(ownerNode: owner)
+        gossip.membership = Cluster.Membership.parse(String(dslLines.first!), nodes: nodes)
+        gossip.seen = Cluster.Gossip.SeenTable.parse(dslLines.dropFirst().joined(separator: "\n"), nodes: nodes)
+        return gossip
+    }
+}
+
+extension Cluster.Gossip.SeenTable {
+    /// Express seen tables using a DSL
+    /// Syntax: each line: `<owner>: <node>@<version>*`
+    internal static func parse(_ dslString: String, nodes: [UniqueNode], file: StaticString = #file, line: UInt = #line) -> Cluster.Gossip.SeenTable {
+        let lines = dslString.split(separator: "\n")
+        func nodeById(id: String.SubSequence) -> UniqueNode {
+            if let found = nodes.first(where: { $0.node.systemName.contains(id) }) {
+                return found
+            } else {
+                fatalError("Could not find node containing [\(id)] in \(nodes), for seen table: \(dslString)", file: file, line: line)
+            }
+        }
+
+        var table = Cluster.Gossip.SeenTable()
+
+        for line in lines {
+            let elements = line.split(separator: " ")
+            let id = elements.first!.dropLast(1)
+            let on = nodeById(id: id)
+
+            var vv = VersionVector.empty
+            for dslVersion in elements.dropFirst() {
+                let parts = dslVersion.split { c in "@:".contains(c) }
+
+                let atId = parts.first!
+                let atNode = nodeById(id: atId)
+
+                let versionString = parts.dropFirst().first!
+                let atVersion = Int(versionString)!
+
+                vv.state[.uniqueNode(atNode)] = atVersion
+            }
+
+            table.underlying[on] = vv
+        }
+
+        return table
+    }
+}
+
+extension VersionVector {
+    internal static func parse(_ dslString: String, nodes: [UniqueNode], file: StaticString = #file, line: UInt = #line) -> VersionVector {
+        func nodeById(id: String.SubSequence) -> UniqueNode {
+            if let found = nodes.first(where: { $0.node.systemName.contains(id) }) {
+                return found
+            } else {
+                fatalError("Could not find node containing [\(id)] in \(nodes), for seen table: \(dslString)", file: file, line: line)
+            }
+        }
+
+        let replicaVersions: [VersionVector.ReplicaVersion] = dslString.split(separator: " ").map { segment in
+            let v = segment.split { c in ":@".contains(c) }
+            return (.uniqueNode(nodeById(id: v.first!)), Int(v.dropFirst().first!)!)
+        }
+        return VersionVector(replicaVersions)
+    }
+}
+
+extension Cluster.Membership {
+    /// Express membership as: `F.up S.down T.joining`.
+    ///
+    /// Syntax reference:
+    ///
+    /// ```
+    /// <node>[.:]<status> || [leader:<node>]
+    /// ```
+    internal static func parse(_ dslString: String, nodes: [UniqueNode], file: StaticString = #file, line: UInt = #line) -> Cluster.Membership {
+        func nodeById(id: String.SubSequence) -> UniqueNode {
+            if let found = nodes.first(where: { $0.node.systemName.contains(id) }) {
+                return found
+            } else {
+                fatalError("Could not find node containing [\(id)] in \(nodes), for seen table: \(dslString)", file: file, line: line)
+            }
+        }
+
+        var membership = Cluster.Membership.empty
+
+        for nodeDsl in dslString.split(separator: " ") {
+            let elements = nodeDsl.split { c in ".:".contains(c) }
+            let nodeId = elements.first!
+            if nodeId == "[leader" {
+                // this is hacky, but good enough for our testing tools
+                let actualNodeId = elements.dropFirst().first!
+                let leaderNode = nodeById(id: actualNodeId.dropLast(1))
+                let leaderMember = membership.uniqueMember(leaderNode)!
+                membership.leader = leaderMember
+            } else {
+                let node = nodeById(id: nodeId)
+
+                let statusString = String(elements.dropFirst().first!)
+                let status = Cluster.MemberStatus.parse(statusString)!
+
+                membership._members[node] = Cluster.Member(node: node, status: status)
+            }
+        }
+
+        return membership
+    }
+}
+
+extension Cluster.MemberStatus {
+    /// Not efficient but useful for constructing mini DSLs to write membership
+    internal static func parse(_ s: String) -> Cluster.MemberStatus? {
+        let id = String(s.trimmingCharacters(in: .symbols))
+        for c in Self.allCases where id == "\(c)" {
+            return c
+        }
+
+        return nil
+    }
+}
diff --git a/Tests/DistributedActorsTests/Cluster/TestExtensions.swift b/Tests/DistributedActorsTests/Cluster/TestExtensions.swift
new file mode 100644
index 000000000..d082d5f17
--- /dev/null
+++ b/Tests/DistributedActorsTests/Cluster/TestExtensions.swift
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift Distributed Actors open source project
+//
+// Copyright (c) 2020 Apple Inc. and the Swift Distributed Actors project authors
+// Licensed under Apache License v2.0
+//
+// See LICENSE.txt for license information
+// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+//===----------------------------------------------------------------------===//
+
+@testable import DistributedActors
+import Logging
+import NIO
+
+// usual reminder that Swift Distributed Actors is not inherently "client/server" once associated, only the handshake is
+enum HandshakeSide: String {
+    case client
+    case server
+}
+
+extension ClusterShellState {
+    static func makeTestMock(side: HandshakeSide, configureSettings: (inout ClusterSettings) -> Void = { _ in () }) -> ClusterShellState {
+        var settings = ClusterSettings(
+            node: Node(
+                systemName: "MockSystem",
+                host: "127.0.0.1",
+                port: 7337
+            )
+        )
+        configureSettings(&settings)
+        let log = Logger(label: "handshake-\(side)") // TODO: could be a mock logger we can assert on?
+ + return ClusterShellState( + settings: settings, + channel: EmbeddedChannel(), + events: EventStream(ref: ActorRef(.deadLetters(.init(log, address: ._deadLetters, system: nil)))), + gossipControl: ConvergentGossipControl(ActorRef(.deadLetters(.init(log, address: ._deadLetters, system: nil)))), + log: log + ) + } +} diff --git a/Tests/DistributedActorsTests/MembershipTests+XCTest.swift b/Tests/DistributedActorsTests/MembershipTests+XCTest.swift index f4206f5ea..ef8ecdd0d 100644 --- a/Tests/DistributedActorsTests/MembershipTests+XCTest.swift +++ b/Tests/DistributedActorsTests/MembershipTests+XCTest.swift @@ -25,11 +25,13 @@ extension MembershipTests { return [ ("test_status_ordering", test_status_ordering), ("test_age_ordering", test_age_ordering), - ("test_member_forNonUniqueNode", test_member_forNonUniqueNode), - ("test_member_forNonUniqueNode_givenReplacementNodeStored", test_member_forNonUniqueNode_givenReplacementNodeStored), + ("test_membership_equality", test_membership_equality), + ("test_member_equality", test_member_equality), + ("test_member_replacement_shouldOfferChange", test_member_replacement_shouldOfferChange), ("test_apply_LeadershipChange", test_apply_LeadershipChange), ("test_join_memberReplacement", test_join_memberReplacement), ("test_apply_memberReplacement", test_apply_memberReplacement), + ("test_apply_memberRemoval", test_apply_memberRemoval), ("test_members_listing", test_members_listing), ("test_members_listing_filteringByReachability", test_members_listing_filteringByReachability), ("test_mark_shouldOnlyProceedForwardInStatuses", test_mark_shouldOnlyProceedForwardInStatuses), @@ -46,6 +48,8 @@ extension MembershipTests { ("test_mergeForward_fromAhead_same", test_mergeForward_fromAhead_same), ("test_mergeForward_fromAhead_membership_withAdditionalMember", test_mergeForward_fromAhead_membership_withAdditionalMember), ("test_mergeForward_fromAhead_membership_withMemberNowDown", test_mergeForward_fromAhead_membership_withMemberNowDown), + ("test_mergeForward_fromAhead_membership_withDownMembers", test_mergeForward_fromAhead_membership_withDownMembers), + ("test_mergeForward_fromAhead_membership_ignoreRemovedWithoutPrecedingDown", test_mergeForward_fromAhead_membership_ignoreRemovedWithoutPrecedingDown), ] } } diff --git a/Tests/DistributedActorsTests/MembershipTests.swift b/Tests/DistributedActorsTests/MembershipTests.swift index e2b3e6ef2..5acd2be47 100644 --- a/Tests/DistributedActorsTests/MembershipTests.swift +++ b/Tests/DistributedActorsTests/MembershipTests.swift @@ -17,13 +17,24 @@ import DistributedActorsTestKit import XCTest final class MembershipTests: XCTestCase { - let firstMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up) - let secondMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .up) - let thirdMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "3.3.3.3", port: 9119), nid: .random()), status: .up) - let newMember = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "4.4.4.4", port: 1001), nid: .random()), status: .up) + let memberA = Cluster.Member(node: UniqueNode(node: Node(systemName: "nodeA", host: "1.1.1.1", port: 1111), nid: .random()), status: .up) + var nodeA: UniqueNode { self.memberA.node } + + let memberB = Cluster.Member(node: UniqueNode(node: Node(systemName: "nodeB", host: "2.2.2.2", port: 2222), nid: .random()), status: .up) 
+ var nodeB: UniqueNode { self.memberB.node } + + let memberC = Cluster.Member(node: UniqueNode(node: Node(systemName: "nodeC", host: "3.3.3.3", port: 3333), nid: .random()), status: .up) + var nodeC: UniqueNode { self.memberC.node } + + let memberD = Cluster.Member(node: UniqueNode(node: Node(systemName: "nodeD", host: "4.4.4.4", port: 4444), nid: .random()), status: .up) + var nodeD: UniqueNode { self.memberD.node } + + lazy var allNodes = [ + nodeA, nodeB, nodeC, + ] lazy var initialMembership: Cluster.Membership = [ - firstMember, secondMember, thirdMember, + memberA, memberB, memberC, ] // ==== ------------------------------------------------------------------------------------------------------------ // MARK: status ordering @@ -55,77 +66,116 @@ final class MembershipTests: XCTestCase { func test_age_ordering() { let ms = [ - Cluster.Member(node: firstMember.node, status: .joining), - Cluster.Member(node: firstMember.node, status: .up, upNumber: 1), - Cluster.Member(node: firstMember.node, status: .down, upNumber: 4), - Cluster.Member(node: firstMember.node, status: .up, upNumber: 2), + Cluster.Member(node: memberA.node, status: .joining), + Cluster.Member(node: memberA.node, status: .up, upNumber: 1), + Cluster.Member(node: memberA.node, status: .down, upNumber: 4), + Cluster.Member(node: memberA.node, status: .up, upNumber: 2), ] - let ns = ms.sorted(by: Cluster.Member.ageOrdering).map { $0.upNumber } + let ns = ms.sorted(by: Cluster.Member.ageOrdering).map { $0._upNumber } ns.shouldEqual([nil, 1, 2, 4]) } - // ==== ------------------------------------------------------------------------------------------------------------ - // MARK: Member lookups + // ==== ---------------------------------------------------------------------------------------------------------------- + // MARK: equality - func test_member_forNonUniqueNode() { - var membership: Cluster.Membership = [firstMember, secondMember] - var secondReplacement = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .up) + // Implementation note: + // See the Membership equality implementation for an in depth rationale why the equality works like this. 
- let change = membership.join(secondReplacement.node) - change.isReplacement.shouldBeTrue() + func test_membership_equality() { + let left: Cluster.Membership = [ + Cluster.Member(node: memberA.node, status: .up, upNumber: 1), + Cluster.Member(node: memberB.node, status: .up, upNumber: 1), + Cluster.Member(node: memberC.node, status: .up, upNumber: 1), + ] + let right: Cluster.Membership = [ + Cluster.Member(node: memberA.node, status: .up, upNumber: 1), + Cluster.Member(node: memberB.node, status: .down, upNumber: 1), + Cluster.Member(node: memberC.node, status: .up, upNumber: 1), + ] - membership.members(self.secondMember.node.node).count.shouldEqual(2) + left.shouldNotEqual(right) + right.shouldNotEqual(left) // sanity check, since hand implemented equality + } - let mostUpToDateNodeAboutGivenNode = membership.firstMember(self.secondMember.node.node) - mostUpToDateNodeAboutGivenNode.shouldEqual(secondReplacement) + func test_member_equality() { + // member identity is the underlying unique node, this status DOES NOT contribute to equality: + var member = self.memberA + member.status = .down + member.shouldEqual(self.memberA) - let nonUniqueNode = self.secondMember.node.node - let seconds = membership.members(nonUniqueNode) + // addresses are different + self.memberA.shouldNotEqual(self.memberB) - // the current status of members should be the following by now: + // only the node id is different: + let one = Cluster.Member(node: UniqueNode(node: Node(systemName: "firstA", host: "1.1.1.1", port: 1111), nid: .init(1)), status: .up) + let two = Cluster.Member(node: UniqueNode(node: Node(systemName: "firstA", host: "1.1.1.1", port: 1111), nid: .init(12222)), status: .up) + one.shouldNotEqual(two) - secondReplacement.status = .down - seconds.shouldEqual([secondReplacement, secondMember]) // first the replacement, then the (now down) previous incarnation + // node names do not matter for equality: + let three = Cluster.Member(node: UniqueNode(node: Node(systemName: "does", host: "1.1.1.1", port: 1111), nid: .init(1)), status: .up) + let four = Cluster.Member(node: UniqueNode(node: Node(systemName: "not matter", host: "1.1.1.1", port: 1111), nid: .init(12222)), status: .up) + three.shouldNotEqual(four) } - func test_member_forNonUniqueNode_givenReplacementNodeStored() {} + // ==== ------------------------------------------------------------------------------------------------------------ + // MARK: Member lookups + + func test_member_replacement_shouldOfferChange() { + var membership: Cluster.Membership = [memberA, memberB] + let secondReplacement = Cluster.Member( + node: UniqueNode(node: Node(systemName: self.nodeB.node.systemName, host: self.nodeB.node.host, port: self.nodeB.node.port), nid: .random()), status: .up + ) + + let change = membership.apply(Cluster.MembershipChange(member: secondReplacement))! 
+ change.isReplacement.shouldBeTrue() + change.member.shouldEqual(secondReplacement) + change.replacementDownPreviousNodeChange.shouldEqual( + Cluster.MembershipChange(member: self.memberB, toStatus: .down) + ) + + membership.members(atLeast: .joining).count.shouldEqual(2) + let memberNode = membership.uniqueMember(change.member.node) + memberNode?.status.shouldEqual(Cluster.MemberStatus.up) + } // ==== ---------------------------------------------------------------------------------------------------------------- // MARK: Applying changes func test_apply_LeadershipChange() throws { var membership = self.initialMembership - membership.isLeader(self.firstMember).shouldBeFalse() + membership.isLeader(self.memberA).shouldBeFalse() - let change = try membership.applyLeadershipChange(to: self.firstMember) - change.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.firstMember)) - membership.isLeader(self.firstMember).shouldBeTrue() + let change = try membership.applyLeadershipChange(to: self.memberA) + change.shouldEqual(Cluster.LeadershipChange(oldLeader: nil, newLeader: self.memberA)) + membership.isLeader(self.memberA).shouldBeTrue() // applying "same change" no-op - let noChange = try membership.applyLeadershipChange(to: self.firstMember) + let noChange = try membership.applyLeadershipChange(to: self.memberA) noChange.shouldBeNil() // changing to no leader is ok let noLeaderChange = try membership.applyLeadershipChange(to: nil) - noLeaderChange.shouldEqual(Cluster.LeadershipChange(oldLeader: self.firstMember, newLeader: nil)) + noLeaderChange.shouldEqual(Cluster.LeadershipChange(oldLeader: self.memberA, newLeader: nil)) do { - _ = try membership.applyLeadershipChange(to: self.newMember) // not part of membership (!) + _ = try membership.applyLeadershipChange(to: self.memberD) // not part of membership (!) } catch { "\(error)".shouldStartWith(prefix: "nonMemberLeaderSelected") } } + // TODO: what if leadership change oldLeader also implies the oldLeader -> .down + func test_join_memberReplacement() { var membership = self.initialMembership - let replacesFirstNode = UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()) + let replacesFirstNode = UniqueNode(node: self.nodeA.node, nid: .random()) - let change = membership.join(replacesFirstNode) + let change = membership.join(replacesFirstNode)! 
         change.isReplacement.shouldBeTrue()
-        change.replaced.shouldEqual(self.firstMember)
-        change.replaced!.status.shouldEqual(self.firstMember.status)
+        change.replaced.shouldEqual(self.memberA)
+        change.replaced!.status.shouldEqual(self.memberA.status)
         change.node.shouldEqual(replacesFirstNode)
         change.toStatus.shouldEqual(.joining)
     }
@@ -133,7 +183,7 @@ final class MembershipTests: XCTestCase {
     func test_apply_memberReplacement() throws {
         var membership = self.initialMembership

-        let firstReplacement = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up)
+        let firstReplacement = Cluster.Member(node: UniqueNode(node: self.nodeA.node, nid: .init(111_111)), status: .up)

         try shouldNotThrow {
             guard let change = membership.apply(Cluster.MembershipChange(member: firstReplacement)) else {
@@ -141,13 +191,31 @@
             }

             change.isReplacement.shouldBeTrue()
-            change.replaced.shouldEqual(self.firstMember)
-            change.replaced!.status.shouldEqual(self.firstMember.status)
+            change.replaced.shouldEqual(self.memberA)
+            change.replaced!.status.shouldEqual(self.memberA.status)
             change.node.shouldEqual(firstReplacement.node)
             change.toStatus.shouldEqual(firstReplacement.status)
         }
     }

+    func test_apply_memberRemoval() throws {
+        var membership = self.initialMembership
+
+        let removal = Cluster.Member(node: self.memberA.node, status: .removed)
+
+        try shouldNotThrow {
+            guard let change = membership.apply(Cluster.MembershipChange(member: removal)) else {
+                throw TestError("Expected a change, but didn't get one")
+            }
+
+            change.isReplacement.shouldBeFalse()
+            change.node.shouldEqual(removal.node)
+            change.toStatus.shouldEqual(removal.status)
+
+            membership.uniqueMember(self.memberA.node).shouldBeNil()
+        }
+    }
+
     // ==== ------------------------------------------------------------------------------------------------------------
     // MARK: member listing

@@ -155,7 +223,7 @@
         self.initialMembership.members(atLeast: .joining).count.shouldEqual(3)
         self.initialMembership.members(atLeast: .up).count.shouldEqual(3)
         var changed = self.initialMembership
-        _ = changed.mark(self.firstMember.node, as: .down)
+        _ = changed.mark(self.memberA.node, as: .down)
         changed.count(atLeast: .joining).shouldEqual(3)
         changed.count(atLeast: .up).shouldEqual(3)
         changed.count(atLeast: .leaving).shouldEqual(1)
@@ -165,10 +233,10 @@
     func test_members_listing_filteringByReachability() {
         var changed = self.initialMembership
-        _ = changed.mark(self.firstMember.node, as: .down)
+        _ = changed.mark(self.memberA.node, as: .down)

-        _ = changed.mark(self.firstMember.node, reachability: .unreachable)
-        _ = changed.mark(self.secondMember.node, reachability: .unreachable)
+        _ = changed.mark(self.memberA.node, reachability: .unreachable)
+        _ = changed.mark(self.memberB.node, reachability: .unreachable)

         // exact status match

@@ -241,7 +309,7 @@ final class MembershipTests: XCTestCase {
     }

     func test_mark_shouldNotReturnChangeForMarkingAsSameStatus() {
-        let member = self.firstMember
+        let member = self.memberA
         var membership: Cluster.Membership = [member]

         let noChange = membership.mark(member.node, as: member.status)
@@ -266,46 +334,48 @@ final class MembershipTests: XCTestCase {
     func test_join_overAnExistingMode_replacement() {
         var membership = self.initialMembership
-        let secondReplacement = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "2.2.2.2", port: 8228), nid: .random()), status: .joining)
-        let change = membership.join(secondReplacement.node)
+        let secondReplacement = Cluster.Member(node: UniqueNode(node: self.nodeB.node, nid: .random()), status: .joining)
+        let change = membership.join(secondReplacement.node)!
         change.isReplacement.shouldBeTrue()

-        let members = membership.members(secondReplacement.node.node)
-        var secondDown = self.secondMember
+        let members = membership.members(atLeast: .joining)
+        var secondDown = self.memberB
         secondDown.status = .down
-        members.shouldContain(secondDown)
+
+        members.count.shouldEqual(3)
         members.shouldContain(secondReplacement)
+        members.shouldNotContain(self.memberB) // was replaced
     }

     func test_mark_replacement() throws {
-        var membership: Cluster.Membership = [self.firstMember]
+        var membership: Cluster.Membership = [self.memberA]

-        let firstReplacement = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up)
+        let firstReplacement = Cluster.Member(node: UniqueNode(node: self.nodeA.node, nid: .random()), status: .up)

         try shouldNotThrow {
             guard let change = membership.mark(firstReplacement.node, as: firstReplacement.status) else {
                 throw TestError("Expected a change")
             }
             change.isReplacement.shouldBeTrue()
-            change.replaced.shouldEqual(self.firstMember)
-            change.fromStatus.shouldEqual(nil)
+            change.replaced.shouldEqual(self.memberA)
+            change.fromStatus.shouldEqual(.up)
             change.node.shouldEqual(firstReplacement.node)
             change.toStatus.shouldEqual(.up)
         }
     }

     func test_replacement_changeCreation() {
-        var existing = self.firstMember
+        var existing = self.memberA
         existing.status = .joining

-        let replacement = Cluster.Member(node: UniqueNode(node: Node(systemName: "System", host: "1.1.1.1", port: 7337), nid: .random()), status: .up)
+        let replacement = Cluster.Member(node: UniqueNode(node: existing.node.node, nid: .random()), status: .up)

         let change = Cluster.MembershipChange(replaced: existing, by: replacement)
         change.isReplacement.shouldBeTrue()
         change.member.shouldEqual(replacement)
         change.node.shouldEqual(replacement.node)
-        change.fromStatus.shouldBeNil() // the replacement is "from unknown" after all
+        change.fromStatus.shouldEqual(existing.status)
         change.replaced!.status.shouldEqual(existing.status) // though we have the replaced member, it will have its own previous status
         change.replaced.shouldEqual(existing)
@@ -317,7 +387,7 @@
     // MARK: Moving members along their lifecycle

     func test_moveForward_MemberStatus() {
-        var member = self.firstMember
+        var member = self.memberA
         member.status = .joining
         let joiningMember = member
@@ -331,37 +401,50 @@
         member.status = .removed

         member.status = .joining
-        member.moveForward(.up).shouldEqual(Cluster.MembershipChange(member: joiningMember, toStatus: .up))
+        member.moveForward(to: .up).shouldEqual(Cluster.MembershipChange(member: joiningMember, toStatus: .up))
         member.status.shouldEqual(.up)
-        member.moveForward(.joining).shouldEqual(nil) // no change, cannot move back
+        member.moveForward(to: .joining).shouldEqual(nil) // no change, cannot move back

-        member.moveForward(.up).shouldEqual(nil) // no change, cannot move to same status
-        member.moveForward(.leaving).shouldEqual(Cluster.MembershipChange(member: upMember, toStatus: .leaving))
+        member.moveForward(to: .up).shouldEqual(nil) // no change, cannot move to same status
+        member.moveForward(to: .leaving).shouldEqual(Cluster.MembershipChange(member: upMember, toStatus: .leaving))
         member.status.shouldEqual(.leaving)

-        member.moveForward(.joining).shouldEqual(nil) // no change, cannot move back
-        member.moveForward(.up).shouldEqual(nil) // no change, cannot move back
-        member.moveForward(.leaving).shouldEqual(nil) // no change, same
-        member.moveForward(.down).shouldEqual(Cluster.MembershipChange(member: leavingMember, toStatus: .down))
+        member.moveForward(to: .joining).shouldEqual(nil) // no change, cannot move back
+        member.moveForward(to: .up).shouldEqual(nil) // no change, cannot move back
+        member.moveForward(to: .leaving).shouldEqual(nil) // no change, same
+        member.moveForward(to: .down).shouldEqual(Cluster.MembershipChange(member: leavingMember, toStatus: .down))
         member.status.shouldEqual(.down)

-        member.moveForward(.joining).shouldEqual(nil) // no change, cannot move back
-        member.moveForward(.up).shouldEqual(nil) // no change, cannot move back
-        member.moveForward(.leaving).shouldEqual(nil) // no change, cannot move back
-        member.moveForward(.down).shouldEqual(nil) // no change, same
-        member.moveForward(.removed).shouldEqual(Cluster.MembershipChange(member: downMember, toStatus: .removed))
+        member.moveForward(to: .joining).shouldEqual(nil) // no change, cannot move back
+        member.moveForward(to: .up).shouldEqual(nil) // no change, cannot move back
+        member.moveForward(to: .leaving).shouldEqual(nil) // no change, cannot move back
+        member.moveForward(to: .down).shouldEqual(nil) // no change, same
+        member.moveForward(to: .removed).shouldEqual(Cluster.MembershipChange(member: downMember, toStatus: .removed))
         member.status.shouldEqual(.removed)

         member.status = .joining
-        member.moveForward(.leaving).shouldEqual(Cluster.MembershipChange(member: joiningMember, toStatus: .leaving))
+        member.moveForward(to: .leaving).shouldEqual(Cluster.MembershipChange(member: joiningMember, toStatus: .leaving))
         member.status.shouldEqual(.leaving)

         member.status = .joining
-        member.moveForward(.down).shouldEqual(Cluster.MembershipChange(member: joiningMember, toStatus: .down))
+        member.moveForward(to: .down).shouldEqual(Cluster.MembershipChange(member: joiningMember, toStatus: .down))
         member.status.shouldEqual(.down)

+        // moving to removed is only allowed from .down
+        member.status = .joining
+        member.moveForward(to: .removed).shouldBeNil()
+        member.status.shouldEqual(.joining)
+
         member.status = .up
-        member.moveForward(.removed).shouldEqual(Cluster.MembershipChange(member: upMember, toStatus: .removed))
+        member.moveForward(to: .removed).shouldBeNil()
+        member.status.shouldEqual(.up)
+
+        member.status = .leaving
+        member.moveForward(to: .removed).shouldBeNil()
+        member.status.shouldEqual(.leaving)
+
+        member.status = .down
+        member.moveForward(to: .removed).shouldEqual(Cluster.MembershipChange(member: downMember, toStatus: .removed))
         member.status.shouldEqual(.removed)
     }

@@ -370,42 +453,42 @@
     func test_membershipDiff_beEmpty_whenNothingChangedForIt() {
         let changed = self.initialMembership
-        let diff = Cluster.Membership.diff(from: self.initialMembership, to: changed)
+        let diff = Cluster.Membership._diff(from: self.initialMembership, to: changed)

         diff.changes.count.shouldEqual(0)
     }

     func test_membershipDiff_shouldIncludeEntry_whenStatusChangedForIt() {
-        let changed = self.initialMembership.marking(self.firstMember.node, as: .leaving)
+        let changed = self.initialMembership.marking(self.memberA.node, as: .leaving)

-        let diff = Cluster.Membership.diff(from: self.initialMembership, to: changed)
+        let diff = Cluster.Membership._diff(from: self.initialMembership, to: changed)

         diff.changes.count.shouldEqual(1)
         let diffEntry = diff.changes.first!
-        diffEntry.node.shouldEqual(self.firstMember.node)
+        diffEntry.node.shouldEqual(self.memberA.node)
         diffEntry.fromStatus?.shouldEqual(.up)
         diffEntry.toStatus.shouldEqual(.leaving)
     }

     func test_membershipDiff_shouldIncludeEntry_whenMemberRemoved() {
-        let changed = self.initialMembership.removing(self.firstMember.node)
+        let changed = self.initialMembership.removingCompletely(self.memberA.node)

-        let diff = Cluster.Membership.diff(from: self.initialMembership, to: changed)
+        let diff = Cluster.Membership._diff(from: self.initialMembership, to: changed)

         diff.changes.count.shouldEqual(1)
         let diffEntry = diff.changes.first!
-        diffEntry.node.shouldEqual(self.firstMember.node)
+        diffEntry.node.shouldEqual(self.memberA.node)
         diffEntry.fromStatus?.shouldEqual(.up)
         diffEntry.toStatus.shouldEqual(.removed)
     }

     func test_membershipDiff_shouldIncludeEntry_whenMemberAdded() {
-        let changed = self.initialMembership.joining(self.newMember.node)
+        let changed = self.initialMembership.joining(self.memberD.node)

-        let diff = Cluster.Membership.diff(from: self.initialMembership, to: changed)
+        let diff = Cluster.Membership._diff(from: self.initialMembership, to: changed)

         diff.changes.count.shouldEqual(1)
         let diffEntry = diff.changes.first!
-        diffEntry.node.shouldEqual(self.newMember.node)
+        diffEntry.node.shouldEqual(self.memberD.node)
         diffEntry.fromStatus.shouldBeNil()
         diffEntry.toStatus.shouldEqual(.joining)
     }
@@ -417,7 +500,7 @@
         var membership = self.initialMembership
         let ahead = self.initialMembership

-        let changes = membership.mergeForward(fromAhead: ahead)
+        let changes = membership.mergeFrom(incoming: ahead, myself: nil)

         changes.count.shouldEqual(0)
         membership.shouldEqual(self.initialMembership)
@@ -426,22 +509,72 @@
     func test_mergeForward_fromAhead_membership_withAdditionalMember() {
         var membership = self.initialMembership
         var ahead = membership
-        _ = ahead.join(self.newMember.node)
+        _ = ahead.join(self.memberD.node)!
- let changes = membership.mergeForward(fromAhead: ahead) + let changes = membership.mergeFrom(incoming: ahead, myself: nil) changes.count.shouldEqual(1) - membership.shouldEqual(self.initialMembership.joining(self.newMember.node)) + membership.shouldEqual(self.initialMembership.joining(self.memberD.node)) } func test_mergeForward_fromAhead_membership_withMemberNowDown() { - var membership = self.initialMembership - var ahead = membership - _ = ahead.mark(self.firstMember.node, as: .down) + var membership = Cluster.Membership.parse( + """ + A.up B.up C.up [leader:C] + """, nodes: self.allNodes + ) - let changes = membership.mergeForward(fromAhead: ahead) + let ahead = Cluster.Membership.parse( + """ + A.down B.up C.up + """, nodes: self.allNodes + ) + + let changes = membership.mergeFrom(incoming: ahead, myself: nil) changes.count.shouldEqual(1) - membership.shouldEqual(self.initialMembership) + var expected = membership + _ = expected.mark(self.nodeA, as: .down) + membership.shouldEqual(expected) + } + + func test_mergeForward_fromAhead_membership_withDownMembers() { + var membership = Cluster.Membership.parse( + """ + A.up B.up + """, nodes: self.allNodes + ) + + let ahead = Cluster.Membership.parse( + """ + A.down B.up C.down + """, nodes: self.allNodes + ) + + let changes = membership.mergeFrom(incoming: ahead, myself: nil) + + changes.count.shouldEqual(1) + changes.shouldEqual([ + Cluster.MembershipChange(node: self.nodeA, fromStatus: .up, toStatus: .down), + // we do not ADD .down members to our view + ]) + var expected = membership + _ = expected.mark(self.nodeA, as: .down) + membership.shouldEqual(expected) + } + + func test_mergeForward_fromAhead_membership_ignoreRemovedWithoutPrecedingDown() { + var membership = Cluster.Membership.parse( + "A.up B.up C.up [leader:C]", nodes: self.allNodes + ) + + let ahead = Cluster.Membership.parse( + "A.removed B.up C.up [leader:C]", nodes: self.allNodes + ) + + let changes = membership.mergeFrom(incoming: ahead, myself: self.nodeA) + + // removed MUST follow a .down, yet A was never down, so we ignore this. 
+ changes.shouldEqual([]) } } diff --git a/Tests/DistributedActorsTests/NodeDeathWatcherTests.swift b/Tests/DistributedActorsTests/NodeDeathWatcherTests.swift index 1e1dc78e9..82295d7da 100644 --- a/Tests/DistributedActorsTests/NodeDeathWatcherTests.swift +++ b/Tests/DistributedActorsTests/NodeDeathWatcherTests.swift @@ -54,7 +54,7 @@ final class NodeDeathWatcherTests: ClusteredNodesTestBase { } }) - try self.ensureNodes(.up, systems: first, second) + try self.ensureNodes(.up, nodes: first.cluster.node, second.cluster.node) first.cluster.down(node: second.cluster.node.node) // should cause termination of all remote actors, observed by the local actors on [first] diff --git a/Tests/LinuxMain.swift b/Tests/LinuxMain.swift index 784cc2ce6..e001be4a7 100644 --- a/Tests/LinuxMain.swift +++ b/Tests/LinuxMain.swift @@ -68,8 +68,8 @@ XCTMain([ testCase(ClusterAssociationTests.allTests), testCase(ClusterEventStreamTests.allTests), testCase(ClusterEventsSerializationTests.allTests), + testCase(ClusterLeaderActionsClusteredTests.allTests), testCase(ClusterLeaderActionsTests.allTests), - testCase(ClusterMembershipGossipTests.allTests), testCase(ClusterOnDownActionTests.allTests), testCase(ClusterReceptionistTests.allTests), testCase(ConcurrencyHelpersTests.allTests), @@ -83,13 +83,14 @@ XCTMain([ testCase(FixedThreadPoolTests.allTests), testCase(GenCodableTests.allTests), testCase(GenerateActorsTests.allTests), + testCase(GossipSeenTableTests.allTests), testCase(HeapTests.allTests), testCase(InterceptorTests.allTests), testCase(LamportClockTests.allTests), testCase(LeadershipTests.allTests), testCase(MPSCLinkedQueueTests.allTests), testCase(MailboxTests.allTests), - testCase(MembershipGossipSeenTableTests.allTests), + testCase(MembershipGossipClusteredTests.allTests), testCase(MembershipGossipTests.allTests), testCase(MembershipSerializationTests.allTests), testCase(MembershipTests.allTests), diff --git a/scripts/dev/test_time_stats.sh b/scripts/dev/test_time_stats.sh old mode 100644 new mode 100755
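Note on the semantics the new MembershipTests above pin down: a member's status only ever moves forward (joining, up, leaving, down, removed), `.removed` is only reachable from `.down`, and a gossiped `.removed` that was not preceded by an observed `.down` is ignored during merges. The following standalone sketch illustrates just that ordering rule; the `Status` enum and `movedForward(to:)` helper here are hypothetical stand-ins for illustration only, not the library's actual `Cluster.MemberStatus` or `moveForward(to:)` implementation:

    // Hypothetical sketch of the "only move forward, and .removed only after .down" rule
    // exercised by test_moveForward_MemberStatus above.
    enum Status: Int, Comparable {
        case joining = 0, up, leaving, down, removed

        static func < (lhs: Status, rhs: Status) -> Bool { lhs.rawValue < rhs.rawValue }

        /// Returns the target status if the transition is a legal "move forward", otherwise nil.
        func movedForward(to next: Status) -> Status? {
            guard next > self else { return nil } // never move backward or to the same status
            if next == .removed, self != .down { return nil } // .removed may only follow .down
            return next
        }
    }

    assert(Status.joining.movedForward(to: .up) == .up)
    assert(Status.up.movedForward(to: .joining) == nil) // cannot move back
    assert(Status.leaving.movedForward(to: .removed) == nil) // removal must go through .down first
    assert(Status.down.movedForward(to: .removed) == .removed)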