Skip to content

Commit b23d996

Browse files
committed
ML: Adding support for lazy nodes (#29991) (#34538)
1 parent 1ff55f4 commit b23d996

File tree

3 files changed

+105
-5
lines changed

3 files changed

+105
-5
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,8 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
262262
Setting.intSetting("xpack.ml.node_concurrent_job_allocations", 2, 0, Property.Dynamic, Property.NodeScope);
263263
public static final Setting<Integer> MAX_MACHINE_MEMORY_PERCENT =
264264
Setting.intSetting("xpack.ml.max_machine_memory_percent", 30, 5, 90, Property.Dynamic, Property.NodeScope);
265+
public static final Setting<Integer> MAX_LAZY_ML_NODES =
266+
Setting.intSetting("xpack.ml.max_lazy_ml_nodes", 0, 0, 3, Property.Dynamic, Property.NodeScope);
265267

266268
private static final Logger logger = Loggers.getLogger(XPackPlugin.class);
267269

@@ -293,6 +295,7 @@ public List<Setting<?>> getSettings() {
293295
ML_ENABLED,
294296
CONCURRENT_JOB_ALLOCATIONS,
295297
MachineLearningField.MAX_MODEL_MEMORY_LIMIT,
298+
MAX_LAZY_ML_NODES,
296299
MAX_MACHINE_MEMORY_PERCENT,
297300
AutodetectBuilder.DONT_PERSIST_MODEL_STATE_SETTING,
298301
AutodetectBuilder.MAX_ANOMALY_RECORDS_SETTING,

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportOpenJobAction.java

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ public class TransportOpenJobAction extends TransportMasterNodeAction<OpenJobAct
100100
private final PersistentTasksService persistentTasksService;
101101
private final Client client;
102102
private final JobResultsProvider jobResultsProvider;
103+
private static final PersistentTasksCustomMetaData.Assignment AWAITING_LAZY_ASSIGNMENT =
104+
new PersistentTasksCustomMetaData.Assignment(null, "persistent task is awaiting node assignment.");
105+
103106

104107
@Inject
105108
public TransportOpenJobAction(Settings settings, TransportService transportService, ThreadPool threadPool,
@@ -700,6 +703,7 @@ public static class OpenJobPersistentTasksExecutor extends PersistentTasksExecut
700703
private final int fallbackMaxNumberOfOpenJobs;
701704
private volatile int maxConcurrentJobAllocations;
702705
private volatile int maxMachineMemoryPercent;
706+
private volatile int maxLazyMLNodes;
703707

704708
public OpenJobPersistentTasksExecutor(Settings settings, ClusterService clusterService,
705709
AutodetectProcessManager autodetectProcessManager) {
@@ -708,16 +712,35 @@ public OpenJobPersistentTasksExecutor(Settings settings, ClusterService clusterS
708712
this.fallbackMaxNumberOfOpenJobs = AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE.get(settings);
709713
this.maxConcurrentJobAllocations = MachineLearning.CONCURRENT_JOB_ALLOCATIONS.get(settings);
710714
this.maxMachineMemoryPercent = MachineLearning.MAX_MACHINE_MEMORY_PERCENT.get(settings);
715+
this.maxLazyMLNodes = MachineLearning.MAX_LAZY_ML_NODES.get(settings);
711716
clusterService.getClusterSettings()
712717
.addSettingsUpdateConsumer(MachineLearning.CONCURRENT_JOB_ALLOCATIONS, this::setMaxConcurrentJobAllocations);
713718
clusterService.getClusterSettings()
714719
.addSettingsUpdateConsumer(MachineLearning.MAX_MACHINE_MEMORY_PERCENT, this::setMaxMachineMemoryPercent);
720+
clusterService.getClusterSettings().addSettingsUpdateConsumer(MachineLearning.MAX_LAZY_ML_NODES, this::setMaxLazyMLNodes);
715721
}
716722

717723
@Override
718724
public PersistentTasksCustomMetaData.Assignment getAssignment(OpenJobAction.JobParams params, ClusterState clusterState) {
719-
return selectLeastLoadedMlNode(params.getJobId(), clusterState, maxConcurrentJobAllocations, fallbackMaxNumberOfOpenJobs,
720-
maxMachineMemoryPercent, logger);
725+
PersistentTasksCustomMetaData.Assignment assignment = selectLeastLoadedMlNode(params.getJobId(),
726+
clusterState,
727+
maxConcurrentJobAllocations,
728+
fallbackMaxNumberOfOpenJobs,
729+
maxMachineMemoryPercent,
730+
logger);
731+
if (assignment.getExecutorNode() == null) {
732+
int numMlNodes = 0;
733+
for(DiscoveryNode node : clusterState.getNodes()) {
734+
if (Boolean.valueOf(node.getAttributes().get(MachineLearning.ML_ENABLED_NODE_ATTR))) {
735+
numMlNodes++;
736+
}
737+
}
738+
739+
if (numMlNodes < maxLazyMLNodes) { // Means we have lazy nodes left to allocate
740+
assignment = AWAITING_LAZY_ASSIGNMENT;
741+
}
742+
}
743+
return assignment;
721744
}
722745

723746
@Override
@@ -727,9 +750,9 @@ public void validate(OpenJobAction.JobParams params, ClusterState clusterState)
727750

728751
// If we already know that we can't find an ml node because all ml nodes are running at capacity or
729752
// simply because there are no ml nodes in the cluster then we fail quickly here:
730-
PersistentTasksCustomMetaData.Assignment assignment = selectLeastLoadedMlNode(params.getJobId(), clusterState,
731-
maxConcurrentJobAllocations, fallbackMaxNumberOfOpenJobs, maxMachineMemoryPercent, logger);
732-
if (assignment.getExecutorNode() == null) {
753+
754+
PersistentTasksCustomMetaData.Assignment assignment = getAssignment(params, clusterState);
755+
if (assignment.getExecutorNode() == null && assignment.equals(AWAITING_LAZY_ASSIGNMENT) == false) {
733756
throw makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
734757
}
735758
}
@@ -773,6 +796,12 @@ void setMaxMachineMemoryPercent(int maxMachineMemoryPercent) {
773796
this.maxMachineMemoryPercent, maxMachineMemoryPercent);
774797
this.maxMachineMemoryPercent = maxMachineMemoryPercent;
775798
}
799+
800+
void setMaxLazyMLNodes(int maxLazyMLNodes) {
801+
logger.info("Changing [{}] from [{}] to [{}]", MachineLearning.MAX_LAZY_ML_NODES.getKey(),
802+
this.maxLazyMLNodes, maxLazyMLNodes);
803+
this.maxLazyMLNodes = maxLazyMLNodes;
804+
}
776805
}
777806

778807
public static class JobTask extends AllocatedPersistentTask implements OpenJobAction.JobTaskMatcher {
@@ -830,6 +859,12 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
830859
jobState = jobTaskState == null ? JobState.OPENING : jobTaskState.getState();
831860

832861
PersistentTasksCustomMetaData.Assignment assignment = persistentTask.getAssignment();
862+
863+
// This means we are awaiting a new node to be spun up, ok to return back to the user to await node creation
864+
if (assignment != null && assignment.equals(AWAITING_LAZY_ASSIGNMENT)) {
865+
return true;
866+
}
867+
833868
// This logic is only appropriate when opening a job, not when reallocating following a failure,
834869
// and this is why this class must only be used when opening a job
835870
if (assignment != null && assignment.equals(PersistentTasksCustomMetaData.INITIAL_ASSIGNMENT) == false &&

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/TooManyJobsIT.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,68 @@ public void testCloseFailedJob() throws Exception {
6161
assertEquals(JobState.OPENED, ((JobTaskState) task.getState()).getState());
6262
}
6363

64+
public void testLazyNodeValidation() throws Exception {
65+
int numNodes = 1;
66+
int maxNumberOfJobsPerNode = 1;
67+
int maxNumberOfLazyNodes = 2;
68+
internalCluster().ensureAtMostNumDataNodes(0);
69+
logger.info("[{}] is [{}]", AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE.getKey(), maxNumberOfJobsPerNode);
70+
for (int i = 0; i < numNodes; i++) {
71+
internalCluster().startNode(Settings.builder()
72+
.put(AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE.getKey(), maxNumberOfJobsPerNode));
73+
}
74+
logger.info("Started [{}] nodes", numNodes);
75+
ensureStableCluster(numNodes);
76+
logger.info("[{}] is [{}]", MachineLearning.MAX_LAZY_ML_NODES.getKey(), maxNumberOfLazyNodes);
77+
// Set our lazy node number
78+
assertTrue(client().admin()
79+
.cluster()
80+
.prepareUpdateSettings()
81+
.setTransientSettings(
82+
Settings.builder()
83+
.put(MachineLearning.MAX_LAZY_ML_NODES.getKey(), maxNumberOfLazyNodes))
84+
.get()
85+
.isAcknowledged());
86+
// create and open first job, which succeeds:
87+
Job.Builder job = createJob("lazy-node-validation-job-1", new ByteSizeValue(2, ByteSizeUnit.MB));
88+
PutJobAction.Request putJobRequest = new PutJobAction.Request(job);
89+
client().execute(PutJobAction.INSTANCE, putJobRequest).get();
90+
client().execute(OpenJobAction.INSTANCE, new OpenJobAction.Request(job.getId())).get();
91+
assertBusy(() -> {
92+
GetJobsStatsAction.Response statsResponse =
93+
client().execute(GetJobsStatsAction.INSTANCE,
94+
new GetJobsStatsAction.Request("lazy-node-validation-job-1")).actionGet();
95+
assertEquals(statsResponse.getResponse().results().get(0).getState(), JobState.OPENED);
96+
});
97+
98+
// create and try to open second job, which succeeds due to lazy node number:
99+
job = createJob("lazy-node-validation-job-2", new ByteSizeValue(2, ByteSizeUnit.MB));
100+
putJobRequest = new PutJobAction.Request(job);
101+
client().execute(PutJobAction.INSTANCE, putJobRequest).get();
102+
client().execute(OpenJobAction.INSTANCE, new OpenJobAction.Request(job.getId())).get(); // Should return while job is opening
103+
104+
assertBusy(() -> {
105+
GetJobsStatsAction.Response statsResponse =
106+
client().execute(GetJobsStatsAction.INSTANCE,
107+
new GetJobsStatsAction.Request("lazy-node-validation-job-2")).actionGet();
108+
// Should get to opening state w/o a node
109+
assertEquals(JobState.OPENING, statsResponse.getResponse().results().get(0).getState());
110+
});
111+
112+
// Add another Node so we can get allocated
113+
internalCluster().startNode(Settings.builder()
114+
.put(AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE.getKey(), maxNumberOfJobsPerNode));
115+
ensureStableCluster(numNodes+1);
116+
117+
// We should automatically get allocated and opened to new node
118+
assertBusy(() -> {
119+
GetJobsStatsAction.Response statsResponse =
120+
client().execute(GetJobsStatsAction.INSTANCE,
121+
new GetJobsStatsAction.Request("lazy-node-validation-job-2")).actionGet();
122+
assertEquals(JobState.OPENED, statsResponse.getResponse().results().get(0).getState());
123+
});
124+
}
125+
64126
public void testSingleNode() throws Exception {
65127
verifyMaxNumberOfJobsLimit(1, randomIntBetween(1, 100));
66128
}

0 commit comments

Comments
 (0)