Skip to content

Commit 05f32c7

Browse files
committed
NoSQL: Fail node-management-impl init after timeout
Also move the expensive part into a `@PostConstruct` method so that it does not block CDI initialization entirely.
1 parent ff495af commit 05f32c7

File tree

3 files changed

+38
-5
lines changed

3 files changed

+38
-5
lines changed

persistence/nosql/nodes/impl/src/main/java/org/apache/polaris/nodes/impl/NodeManagementImpl.java

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import com.google.common.annotations.VisibleForTesting;
2626
import com.google.common.primitives.Ints;
2727
import jakarta.annotation.Nonnull;
28+
import jakarta.annotation.PostConstruct;
2829
import jakarta.annotation.PreDestroy;
2930
import jakarta.enterprise.context.ApplicationScoped;
3031
import jakarta.inject.Inject;
@@ -74,16 +75,17 @@ class NodeManagementImpl implements NodeManagement {
7475
static final Duration RESCHEDULE_AFTER_FAILURE = Duration.ofSeconds(10);
7576
static final Duration RESCHEDULE_UNTIL_EXPIRATION = Duration.ofMinutes(1);
7677
static final Duration RENEWAL_MIN_LEFT_FOR_RENEWAL = Duration.ofSeconds(30);
77-
private final NodeStore nodeStore;
7878
private final NodeManagementConfig config;
7979
private final MonotonicClock clock;
8080
private final int numNodeIds;
81-
private final IdGeneratorFactory<?> idGenFactory;
82-
private final IdGeneratorSpec idGenSpec;
83-
8481
private final Set<NodeLeaseImpl> registeredLeases = ConcurrentHashMap.newKeySet();
85-
private final IdGenerator systemIdGen;
8682
private final AsyncExec scheduler;
83+
private final NodeStoreFactory nodeStoreFactory;
84+
85+
private NodeStore nodeStore;
86+
private IdGeneratorFactory<?> idGenFactory;
87+
private IdGeneratorSpec idGenSpec;
88+
private IdGenerator systemIdGen;
8789

8890
private volatile boolean closed;
8991

@@ -95,6 +97,7 @@ class NodeManagementImpl implements NodeManagement {
9597
NodeStoreFactory nodeStoreFactory,
9698
AsyncExec scheduler) {
9799
var activePeriod = config.leaseDuration().minus(config.renewalPeriod());
100+
this.nodeStoreFactory = nodeStoreFactory;
98101
this.numNodeIds = config.numNodes();
99102
checkArgs(
100103
() ->
@@ -117,7 +120,11 @@ class NodeManagementImpl implements NodeManagement {
117120
this.config = config;
118121
this.clock = clock;
119122
this.scheduler = scheduler;
123+
}
120124

125+
@SuppressWarnings("BusyWait")
126+
@PostConstruct
127+
void init() {
121128
var idGenSpec =
122129
(IdGeneratorSpec) ImmutableIdGeneratorSpec.builder().from(config.idGeneratorSpec()).build();
123130
var validationIdGeneratorSource =
@@ -132,6 +139,9 @@ public int nodeId() {
132139
return 0;
133140
}
134141
};
142+
143+
// If this loop doesn't complete within 10 minutes, we can only give up.
144+
var timeout = clock.currentInstant().plus(Duration.ofMinutes(10));
135145
while (true) {
136146
var existingNodeManagementState = nodeStoreFactory.fetchManagementState();
137147
if (existingNodeManagementState.isPresent()) {
@@ -157,6 +167,19 @@ public int nodeId() {
157167
break;
158168
}
159169
}
170+
if (timeout.isBefore(clock.currentInstant())) {
171+
throw new IllegalStateException(
172+
"Timed out to fetch and/or persist node management configuration. This is likely due to an overloaded backend database.");
173+
}
174+
try {
175+
// random sleep
176+
Thread.sleep(ThreadLocalRandom.current().nextInt(10, 500));
177+
} catch (InterruptedException e) {
178+
Thread.currentThread().interrupt();
179+
throw new IllegalStateException(
180+
"Interrupted while waiting for node management configuration to be fetched/persisted",
181+
e);
182+
}
160183
}
161184

162185
this.idGenSpec = idGenSpec;

persistence/nosql/nodes/impl/src/test/java/org/apache/polaris/nodes/impl/TestNodeLeases.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ public void leaseAgainAfterStalledExecutor() {
6060
.build();
6161
var renewInterval = config.leaseDuration().minus(config.renewalPeriod());
6262
try (var mgmt = new NodeManagementImpl(config, clock, new MockNodeStoreFactory(), scheduler)) {
63+
mgmt.init();
64+
6365
soft.assertThat(scheduler.tasks()).isEmpty();
6466

6567
var lease = mgmt.lease();
@@ -133,6 +135,8 @@ public NodeState persist(
133135
};
134136
try (var mgmt =
135137
new NodeManagementImpl(config, clock, new MockNodeStoreFactory(mockStore), scheduler)) {
138+
mgmt.init();
139+
136140
soft.assertThat(scheduler.tasks()).isEmpty();
137141

138142
var lease = mgmt.lease();

persistence/nosql/nodes/impl/src/test/java/org/apache/polaris/nodes/impl/TestNodeManagementImpl.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ void warnOnIncompatibleIdGeneratorSpec(IdGeneratorSpec spec) {
126126
super.warnOnIncompatibleIdGeneratorSpec(spec);
127127
}
128128
}) {
129+
mgmt.init();
130+
129131
soft.assertThat(incompatible).isTrue();
130132
var node = mgmt.lease();
131133
var idGen = mgmt.buildIdGenerator(node);
@@ -154,6 +156,8 @@ public void simple() {
154156
.build())
155157
.build();
156158
try (var mgmt = new NodeManagementImpl(config, clock, new MockNodeStoreFactory(), scheduler)) {
159+
mgmt.init();
160+
157161
soft.assertThat(mgmt.maxNumberOfNodes()).isEqualTo(config.numNodes());
158162
var lease = mgmt.lease();
159163
soft.assertThat(lease).isNotNull();
@@ -175,6 +179,8 @@ public void allocateAll() {
175179
try (var mutableClock = new MutableMonotonicClock();
176180
var mgmt =
177181
new NodeManagementImpl(config, mutableClock, new MockNodeStoreFactory(), scheduler)) {
182+
mgmt.init();
183+
178184
var numNodeIds = 1 << SnowflakeIdGenerator.DEFAULT_NODE_ID_BITS;
179185
var leases = new ArrayList<NodeLease>();
180186
for (int i = 0; i < numNodeIds; i++) {

0 commit comments

Comments
 (0)