Skip to content

Commit 25c1c36

Browse files
shameersss1 and prabhjyotsingh
authored and committed
YARN-11702: Fix Yarn over allocating containers (apache#6990) Contributed by Syed Shameerur Rahman.
Reviewed-by: Akira Ajisaka <[email protected]> Signed-off-by: Shilun Fan <[email protected]>
1 parent 1f9e28c commit 25c1c36

File tree

8 files changed

+624
-1
lines changed

8 files changed

+624
-1
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,6 +1406,17 @@ public static boolean isAclEnabled(Configuration conf) {
14061406
public static final int DEFAULT_RM_MAX_LOG_AGGREGATION_DIAGNOSTICS_IN_MEMORY =
14071407
10;
14081408

1409+
/**
1410+
* The configuration key for enabling or disabling the auto-correction of container allocation.
1411+
*/
1412+
public static final String RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION = RM_PREFIX
1413+
+ "scheduler.autocorrect.container.allocation";
1414+
1415+
/**
1416+
* Default value: {@value}.
1417+
*/
1418+
public static final boolean DEFAULT_RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION = false;
1419+
14091420
/** Whether to enable log aggregation */
14101421
public static final String LOG_AGGREGATION_ENABLED = YARN_PREFIX
14111422
+ "log-aggregation-enable";

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,21 @@
144144
<name>yarn.resourcemanager.principal</name>
145145
</property>
146146

147+
<property>
148+
<description>
149+
This configuration key enables or disables the auto-correction of container allocation in
150+
YARN. Due to the asynchronous nature of container request and allocation, YARN may sometimes
151+
over-allocate more containers than requested. The auto-correction feature addresses this by
152+
automatically adjusting the number of requested containers based on those already allocated,
153+
preventing extra containers from being allocated.
154+
Since the extra allocated containers will be released by the client within a few seconds,
155+
this may not be a concern in normal circumstances. However, if the user is worried about
156+
resource contention due to over-allocation, enabling this feature can help avoid such cases.
157+
</description>
158+
<name>yarn.resourcemanager.scheduler.autocorrect.container.allocation</name>
159+
<value>false</value>
160+
</property>
161+
147162
<property>
148163
<description>The address of the scheduler interface.</description>
149164
<name>yarn.resourcemanager.scheduler.address</name>

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
import java.util.ArrayList;
2323
import java.util.Arrays;
2424
import java.util.Collection;
25+
import java.util.Collections;
2526
import java.util.EnumSet;
27+
28+
import java.util.HashMap;
2629
import java.util.List;
2730
import java.util.Map;
2831
import java.util.Set;
@@ -33,6 +36,11 @@
3336

3437
import com.google.gson.Gson;
3538
import com.google.gson.reflect.TypeToken;
39+
40+
import org.apache.commons.lang3.builder.EqualsBuilder;
41+
import org.apache.commons.lang3.builder.HashCodeBuilder;
42+
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
43+
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
3644
import org.slf4j.Logger;
3745
import org.slf4j.LoggerFactory;
3846
import org.apache.hadoop.classification.InterfaceAudience.Private;
@@ -142,6 +150,7 @@ public abstract class AbstractYarnScheduler
142150
Thread updateThread;
143151
private final Object updateThreadMonitor = new Object();
144152
private Timer releaseCache;
153+
private boolean autoCorrectContainerAllocation;
145154

146155
/*
147156
* All schedulers which are inheriting AbstractYarnScheduler should use
@@ -196,6 +205,10 @@ public void serviceInit(Configuration conf) throws Exception {
196205
nmHeartbeatInterval =
197206
conf.getLong(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS,
198207
YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS);
208+
skipNodeInterval = YarnConfiguration.getSkipNodeInterval(conf);
209+
autoCorrectContainerAllocation =
210+
conf.getBoolean(YarnConfiguration.RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION,
211+
YarnConfiguration.DEFAULT_RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION);
199212
long configuredMaximumAllocationWaitTime =
200213
conf.getLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS,
201214
YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS);
@@ -589,6 +602,106 @@ public void recoverContainersOnNode(List<NMContainerStatus> containerReports,
589602
}
590603
}
591604

605+
/**
606+
* Autocorrect container resourceRequests by decrementing the number of newly allocated containers
607+
* from the current container request. This also updates the newlyAllocatedContainers to be within
608+
* the limits of the current container resourceRequests.
609+
* ResourceRequests locality/resourceName is not considered while autocorrecting the container
610+
* request, hence when there are two types of resourceRequest which are the same except for the
611+
* locality/resourceName, it is counted as same {@link ContainerObjectType} and the container
612+
* ask and number of newly allocated container is decremented accordingly.
613+
* For example when a client requests for 4 containers with locality/resourceName
614+
* as "node1", AMRMClient augments the resourceRequest into two
615+
* where R1(numContainer=4,locality=*) and R2(numContainer=4,locality=node1),
616+
* if Yarn allocated 6 containers previously, it will release 2 containers as well as
617+
* update the container ask to 0.
618+
*
619+
* If there is a client which directly calls Yarn (without AMRMClient) with
620+
* two requests where R1(numContainer=4,locality=*) and R2(numContainer=4,locality=node1),
621+
* the autocorrection may not work as expected. The use case of such a client is very rare.
622+
*
623+
* <p>
624+
* This method is called from {@link AbstractYarnScheduler#allocate} method. It is package private
625+
* to be used within the scheduler package only.
626+
* @param resourceRequests List of resources to be allocated
627+
* @param application ApplicationAttempt
628+
*/
629+
@VisibleForTesting
630+
protected void autoCorrectContainerAllocation(List<ResourceRequest> resourceRequests,
631+
SchedulerApplicationAttempt application) {
632+
633+
// if there is no resourceRequests for containers or no newly allocated container from
634+
// the previous request there is nothing to do.
635+
if (!autoCorrectContainerAllocation || resourceRequests.isEmpty() ||
636+
application.newlyAllocatedContainers.isEmpty()) {
637+
return;
638+
}
639+
640+
// iterate newlyAllocatedContainers and form a mapping of container type
641+
// and number of its occurrence.
642+
Map<ContainerObjectType, List<RMContainer>> allocatedContainerMap = new HashMap<>();
643+
for (RMContainer rmContainer : application.newlyAllocatedContainers) {
644+
Container container = rmContainer.getContainer();
645+
ContainerObjectType containerObjectType = new ContainerObjectType(
646+
container.getAllocationRequestId(), container.getPriority(),
647+
container.getExecutionType(), container.getResource());
648+
allocatedContainerMap.computeIfAbsent(containerObjectType,
649+
k -> new ArrayList<>()).add(rmContainer);
650+
}
651+
652+
Map<ContainerObjectType, Integer> extraContainerAllocatedMap = new HashMap<>();
653+
// iterate through resourceRequests and update the request by
654+
// decrementing the already allocated containers.
655+
for (ResourceRequest request : resourceRequests) {
656+
ContainerObjectType containerObjectType =
657+
new ContainerObjectType(request.getAllocationRequestId(),
658+
request.getPriority(), request.getExecutionTypeRequest().getExecutionType(),
659+
request.getCapability());
660+
int numContainerAllocated = allocatedContainerMap.getOrDefault(containerObjectType,
661+
Collections.emptyList()).size();
662+
if (numContainerAllocated > 0) {
663+
int numContainerAsk = request.getNumContainers();
664+
int updatedContainerRequest = numContainerAsk - numContainerAllocated;
665+
if (updatedContainerRequest < 0) {
666+
// add an entry to extra allocated map
667+
extraContainerAllocatedMap.put(containerObjectType, Math.abs(updatedContainerRequest));
668+
LOG.debug("{} container of the resource type: {} will be released",
669+
Math.abs(updatedContainerRequest), request);
670+
// if newlyAllocatedContainer count is more than the current container
671+
// resourceRequests, reset it to 0.
672+
updatedContainerRequest = 0;
673+
}
674+
675+
// update the request
676+
LOG.debug("Updating container resourceRequests from {} to {} for the resource type: {}",
677+
numContainerAsk, updatedContainerRequest, request);
678+
request.setNumContainers(updatedContainerRequest);
679+
}
680+
}
681+
682+
// Iterate over the entries in extraContainerAllocatedMap
683+
for (Map.Entry<ContainerObjectType, Integer> entry : extraContainerAllocatedMap.entrySet()) {
684+
ContainerObjectType containerObjectType = entry.getKey();
685+
int extraContainers = entry.getValue();
686+
687+
// Get the list of allocated containers for the current ContainerObjectType
688+
List<RMContainer> allocatedContainers = allocatedContainerMap.get(containerObjectType);
689+
if (allocatedContainers != null) {
690+
for (RMContainer rmContainer : allocatedContainers) {
691+
if (extraContainers > 0) {
692+
// Change the state of the container from ALLOCATED to EXPIRED since it is not required.
693+
LOG.debug("Removing extra container:{}", rmContainer.getContainer());
694+
completedContainer(rmContainer, SchedulerUtils.createAbnormalContainerStatus(
695+
rmContainer.getContainerId(), SchedulerUtils.EXPIRED_CONTAINER),
696+
RMContainerEventType.EXPIRE);
697+
application.newlyAllocatedContainers.remove(rmContainer);
698+
extraContainers--;
699+
}
700+
}
701+
}
702+
}
703+
}
704+
592705
private RMContainer recoverAndCreateContainer(NMContainerStatus status,
593706
RMNode node, String queueName) {
594707
Container container =
@@ -623,6 +736,14 @@ private void recoverResourceRequestForContainer(RMContainer rmContainer) {
623736
return;
624737
}
625738

739+
// when auto correct container allocation is enabled, there can be a case when extra containers
740+
// go to expired state from allocated state. When such scenario happens do not re-attempt the
741+
// container request since this is expected.
742+
if (autoCorrectContainerAllocation &&
743+
RMContainerState.EXPIRED.equals(rmContainer.getState())) {
744+
return;
745+
}
746+
626747
// Add resource request back to Scheduler ApplicationAttempt.
627748

628749
// We lookup the application-attempt here again using
@@ -1514,4 +1635,101 @@ public boolean attemptAllocationOnNode(SchedulerApplicationAttempt appAttempt,
15141635
public void resetSchedulerMetrics() {
15151636
// reset scheduler metrics
15161637
}
1638+
1639+
/**
1640+
* Gets the apps from a given queue.
1641+
*
1642+
* Mechanics:
1643+
* 1. Get all {@link ApplicationAttemptId}s in the given queue by
1644+
* {@link #getAppsInQueue(String)} method.
1645+
* 2. Always need to check validity for the given queue by the returned
1646+
* values.
1647+
*
1648+
* @param queueName queue name
1649+
* @return a collection of app attempt ids in the given queue, it maybe empty.
1650+
* @throws YarnException if {@link #getAppsInQueue(String)} return null, will
1651+
* throw this exception.
1652+
*/
1653+
private List<ApplicationAttemptId> getAppsFromQueue(String queueName)
1654+
throws YarnException {
1655+
List<ApplicationAttemptId> apps = getAppsInQueue(queueName);
1656+
if (apps == null) {
1657+
throw new YarnException("The specified queue: " + queueName
1658+
+ " doesn't exist");
1659+
}
1660+
return apps;
1661+
}
1662+
1663+
/**
1664+
* ContainerObjectType is a container object with the following properties.
1665+
* Namely allocationId, priority, executionType and resourceType.
1666+
*/
1667+
protected class ContainerObjectType extends Object {
1668+
private final long allocationId;
1669+
private final Priority priority;
1670+
private final ExecutionType executionType;
1671+
private final Resource resource;
1672+
1673+
public ContainerObjectType(long allocationId, Priority priority,
1674+
ExecutionType executionType, Resource resource) {
1675+
this.allocationId = allocationId;
1676+
this.priority = priority;
1677+
this.executionType = executionType;
1678+
this.resource = resource;
1679+
}
1680+
1681+
public long getAllocationId() {
1682+
return allocationId;
1683+
}
1684+
1685+
public Priority getPriority() {
1686+
return priority;
1687+
}
1688+
1689+
public ExecutionType getExecutionType() {
1690+
return executionType;
1691+
}
1692+
1693+
public Resource getResource() {
1694+
return resource;
1695+
}
1696+
1697+
@Override
1698+
public int hashCode() {
1699+
return new HashCodeBuilder(17, 37)
1700+
.append(allocationId)
1701+
.append(priority)
1702+
.append(executionType)
1703+
.append(resource)
1704+
.toHashCode();
1705+
}
1706+
1707+
@Override
1708+
public boolean equals(Object obj) {
1709+
if (obj == null) {
1710+
return false;
1711+
}
1712+
if (obj.getClass() != this.getClass()) {
1713+
return false;
1714+
}
1715+
1716+
ContainerObjectType other = (ContainerObjectType) obj;
1717+
return new EqualsBuilder()
1718+
.append(allocationId, other.getAllocationId())
1719+
.append(priority, other.getPriority())
1720+
.append(executionType, other.getExecutionType())
1721+
.append(resource, other.getResource())
1722+
.isEquals();
1723+
}
1724+
1725+
@Override
1726+
public String toString() {
1727+
return "{ContainerObjectType: "
1728+
+ ", Priority: " + getPriority()
1729+
+ ", Allocation Id: " + getAllocationId()
1730+
+ ", Execution Type: " + getExecutionType()
1731+
+ ", Resource: " + getResource()
1732+
+ "}";
1733+
}
1734+
}
15171735
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,8 @@ protected synchronized void addToUpdateContainerErrors(
839839
updateContainerErrors.add(error);
840840
}
841841

842-
protected synchronized void addToNewlyAllocatedContainers(
842+
@VisibleForTesting
843+
public synchronized void addToNewlyAllocatedContainers(
843844
SchedulerNode node, RMContainer rmContainer) {
844845
ContainerId matchedContainerId =
845846
getUpdateContext().matchContainerToOutstandingIncreaseReq(

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,6 +1236,10 @@ public Allocation allocate(ApplicationAttemptId applicationAttemptId,
12361236
application.showRequests();
12371237
}
12381238

1239+
// update the current container ask by considering the already allocated
1240+
// containers from previous allocation request and return updatedNewlyAllocatedContainers.
1241+
autoCorrectContainerAllocation(ask, application);
1242+
12391243
// Update application requests
12401244
if (application.updateResourceRequests(ask) || application
12411245
.updateSchedulingRequests(schedulingRequests)) {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -970,6 +970,11 @@ public Allocation allocate(ApplicationAttemptId appAttemptId,
970970
}
971971
application.showRequests();
972972

973+
// update the current container ask by considering the already allocated containers
974+
// from previous allocation request as well as populate the updatedNewlyAllocatedContainers
975+
// list according to the current ask.
976+
autoCorrectContainerAllocation(ask, application);
977+
973978
// Update application requests
974979
application.updateResourceRequests(ask);
975980

0 commit comments

Comments
 (0)