Skip to content

Commit 18b6707

Browse files
authored
[ML] improve the autoscaling decider reason messages (#69227) (#69270)
It can be difficult to fully grok why a scaling decision was made. This commit improves the messaging with additional information on two different no_scale decisions.
1 parent e9a3e7e commit 18b6707

File tree

2 files changed

+30
-18
lines changed

2 files changed

+30
-18
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import java.util.HashMap;
4444
import java.util.Iterator;
4545
import java.util.List;
46+
import java.util.Locale;
4647
import java.util.Map;
4748
import java.util.Objects;
4849
import java.util.Optional;
@@ -222,13 +223,6 @@ private void resetScaleDownCoolDown() {
222223
this.scaleDownDetected = NO_SCALE_DOWN_POSSIBLE;
223224
}
224225

225-
private boolean canScaleDown(TimeValue coolDown) {
226-
if (this.scaleDownDetected == NO_SCALE_DOWN_POSSIBLE) {
227-
return false;
228-
}
229-
return timeSupplier.get() - scaleDownDetected >= coolDown.millis();
230-
}
231-
232226
private boolean newScaleDownCheck() {
233227
return scaleDownDetected == NO_SCALE_DOWN_POSSIBLE;
234228
}
@@ -325,8 +319,18 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
325319
return noScaleResultOrRefresh(reasonBuilder, memoryTrackingStale, new AutoscalingDeciderResult(
326320
context.currentCapacity(),
327321
reasonBuilder
328-
.setSimpleReason("Passing currently perceived capacity as there are analytics and anomaly jobs in the queue, " +
329-
"but the number in the queue is less than the configured maximum allowed.")
322+
.setSimpleReason(
323+
String.format(
324+
Locale.ROOT,
325+
"Passing currently perceived capacity as there are [%d] analytics and [%d] anomaly jobs in the queue, "
326+
+ "but the number in the queue is less than the configured maximum allowed. "
327+
+ "[%d] for analytics and [%d] for anomaly jobs",
328+
waitingAnalyticsJobs.size(),
329+
waitingAnomalyJobs.size(),
330+
NUM_ANALYTICS_JOBS_IN_QUEUE.get(configuration),
331+
NUM_ANOMALY_JOBS_IN_QUEUE.get(configuration)
332+
)
333+
)
330334
.build()));
331335
}
332336
if (mlMemoryTracker.isRecentlyRefreshed(memoryTrackingStale) == false) {
@@ -379,11 +383,13 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
379383
checkForScaleDown(nodes, clusterState, largestJob, currentScale, reasonBuilder);
380384

381385
if (scaleDownDecision.isPresent()) {
386+
final long now = timeSupplier.get();
382387
if (newScaleDownCheck()) {
383-
scaleDownDetected = timeSupplier.get();
388+
scaleDownDetected = now;
384389
}
385390
TimeValue downScaleDelay = DOWN_SCALE_DELAY.get(configuration);
386-
if (canScaleDown(downScaleDelay)) {
391+
long msLeftToScale = downScaleDelay.millis() - (now - scaleDownDetected);
392+
if (msLeftToScale <= 0) {
387393
return scaleDownDecision.get();
388394
}
389395
logger.debug(() -> new ParameterizedMessage(
@@ -396,11 +402,15 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
396402
context.currentCapacity(),
397403
reasonBuilder
398404
.setSimpleReason(
399-
"Passing currently perceived capacity as configured down scale delay has not be satisfied; configured delay ["
400-
+ downScaleDelay.millis()
401-
+ "] last detected scale down event ["
402-
+ scaleDownDetected
403-
+ "]")
405+
String.format(
406+
Locale.ROOT,
407+
"Passing currently perceived capacity as down scale delay has not be satisfied; configured delay [%s]"
408+
+ "last detected scale down event [%s]. Will request scale down in approximately [%s]",
409+
downScaleDelay.getStringRep(),
410+
XContentElasticsearchExtension.DEFAULT_DATE_PRINTER.print(scaleDownDetected),
411+
TimeValue.timeValueMillis(msLeftToScale).getStringRep()
412+
)
413+
)
404414
.build());
405415
}
406416

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderServiceTests.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -401,8 +401,10 @@ public void testScale_WithNoScaleUpButWaitingJobs() {
401401
DeciderContext deciderContext = new DeciderContext(clusterState, autoscalingCapacity);
402402

403403
AutoscalingDeciderResult result = service.scale(settings, deciderContext);
404-
assertThat(result.reason().summary(),
405-
containsString("Passing currently perceived capacity as there are analytics and anomaly jobs in the queue"));
404+
assertThat(
405+
result.reason().summary(),
406+
containsString("but the number in the queue is less than the configured maximum allowed")
407+
);
406408
assertThat(result.requiredCapacity(), equalTo(autoscalingCapacity));
407409
}
408410

0 commit comments

Comments
 (0)