Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@
<name>yarn.ipc.rpc.class</name>
<value>org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC</value>
</property>

<!-- Resource Manager Configuration -->

<property>
<description>The hostname of the RM.</description>
<name>yarn.resourcemanager.hostname</name>
<value>0.0.0.0</value>
</property>
</property>

<property>
<description>The address of the applications manager interface in the RM.</description>
<name>yarn.resourcemanager.address</name>
Expand Down Expand Up @@ -919,6 +919,13 @@
<value>1.0</value>
</property>

<property>
<description>The number of consecutive missed heartbeats after which a node will be
skipped from scheduling.</description>
<name>yarn.scheduler.skip.node.multiplier</name>
<value>2</value>
</property>

<property>
<description>The minimum allowed version of a connecting nodemanager. The valid values are
NONE (no version checking), EqualToRM (the nodemanager's version is equal to
Expand Down Expand Up @@ -1140,7 +1147,7 @@
<name>yarn.nodemanager.hostname</name>
<value>0.0.0.0</value>
</property>

<property>
<description>The address of the container manager in the NM.</description>
<name>yarn.nodemanager.address</name>
Expand Down Expand Up @@ -1229,13 +1236,13 @@

<property>
<description>
Number of seconds after an application finishes before the nodemanager's
Number of seconds after an application finishes before the nodemanager's
DeletionService will delete the application's localized file directory
and log directory.

To diagnose YARN application problems, set this property's value large
enough (for example, to 600 = 10 minutes) to permit examination of these
directories. After changing the property's value, you must restart the
directories. After changing the property's value, you must restart the
nodemanager in order for it to have an effect.

The roots of YARN applications' work directories are configurable with
Expand All @@ -1254,7 +1261,7 @@
</property>

<property>
<description>List of directories to store localized files in. An
<description>List of directories to store localized files in. An
application's localized file directory will be found in:
${yarn.nodemanager.local-dirs}/usercache/${user}/appcache/application_${appid}.
Individual containers' work directories, called container_${contid}, will
Expand Down Expand Up @@ -1312,7 +1319,7 @@

<property>
<description>Target size of localizer cache in MB, per nodemanager. It is
a target retention size that only includes resources with PUBLIC and
a target retention size that only includes resources with PUBLIC and
PRIVATE visibility and excludes resources with APPLICATION visibility
</description>
<name>yarn.nodemanager.localizer.cache.target-size-mb</name>
Expand Down Expand Up @@ -1350,7 +1357,7 @@
<description>
Where to store container logs. An application's localized log directory
will be found in ${yarn.nodemanager.log-dirs}/application_${appid}.
Individual containers' log directories will be below this, in directories
Individual containers' log directories will be below this, in directories
named container_${contid}. Each container directory will contain the files
stderr, stdout, and syslog generated by that container.
</description>
Expand Down Expand Up @@ -1382,12 +1389,12 @@
</property>

<property>
<description>How long to keep aggregation logs before deleting them. -1 disables.
<description>How long to keep aggregation logs before deleting them. -1 disables.
Be careful: setting this too small will spam the name node.</description>
<name>yarn.log-aggregation.retain-seconds</name>
<value>-1</value>
</property>
</property>

<property>
<description>How long to wait between aggregated log retention checks.
If set to 0 or a negative value then the value is computed as one-tenth
Expand Down Expand Up @@ -1451,7 +1458,7 @@
<value>/tmp/logs</value>
</property>
<property>
<description>The remote log dir will be created at
<description>The remote log dir will be created at
{yarn.nodemanager.remote-app-log-dir}/${user}/{thisParam}
</description>
<name>yarn.nodemanager.remote-app-log-dir-suffix</name>
Expand All @@ -1471,7 +1478,7 @@
</property>

<property>
<description>Amount of physical memory, in MB, that can be allocated
<description>Amount of physical memory, in MB, that can be allocated
for containers. If set to -1 and
yarn.nodemanager.resource.detect-hardware-capabilities is true, it is
automatically calculated (in case of Windows and Linux).
Expand Down Expand Up @@ -1762,9 +1769,9 @@
</property>

<property>
<description>The maximum percentage of disk space utilization allowed after
which a disk is marked as bad. Values can range from 0.0 to 100.0.
If the value is greater than or equal to 100, the nodemanager will check
<description>The maximum percentage of disk space utilization allowed after
which a disk is marked as bad. Values can range from 0.0 to 100.0.
If the value is greater than or equal to 100, the nodemanager will check
for full disk. This applies to yarn.nodemanager.local-dirs and
yarn.nodemanager.log-dirs when
yarn.nodemanager.disk-health-checker.disk-utilization-threshold.enabled is true.</description>
Expand Down Expand Up @@ -2120,8 +2127,8 @@
</property>

<property>
<description>The minimum allowed version of a resourcemanager that a nodemanager will connect to.
The valid values are NONE (no version checking), EqualToNM (the resourcemanager's version is
<description>The minimum allowed version of a resourcemanager that a nodemanager will connect to.
The valid values are NONE (no version checking), EqualToNM (the resourcemanager's version is
equal to or greater than the NM version), or a Version String.</description>
<name>yarn.nodemanager.resourcemanager.minimum.version</name>
<value>NONE</value>
Expand Down Expand Up @@ -2202,7 +2209,7 @@
<name>yarn.client.max-cached-nodemanagers-proxies</name>
<value>0</value>
</property>

<property>
<description>Enable the node manager to recover after starting</description>
<name>yarn.nodemanager.recovery.enabled</name>
Expand Down Expand Up @@ -2314,13 +2321,13 @@
<name>yarn.web-proxy.principal</name>
<value/>
</property>

<property>
<description>Keytab for WebAppProxy, if the proxy is not running as part of
<description>Keytab for WebAppProxy, if the proxy is not running as part of
the RM.</description>
<name>yarn.web-proxy.keytab</name>
</property>

<property>
<description>The address for the web proxy as HOST:PORT, if this is not
given then the proxy will run as part of the RM</description>
Expand All @@ -2334,7 +2341,7 @@
<description>
CLASSPATH for YARN applications. A comma-separated list
of CLASSPATH entries. When this value is empty, the following default
CLASSPATH for YARN applications would be used.
CLASSPATH for YARN applications would be used.
For Linux:
$HADOOP_CONF_DIR,
$HADOOP_COMMON_HOME/share/hadoop/common/*,
Expand Down Expand Up @@ -2849,29 +2856,29 @@
<name>yarn.sharedcache.app-checker.class</name>
<value>org.apache.hadoop.yarn.server.sharedcachemanager.RemoteAppChecker</value>
</property>

<property>
<description>A resource in the in-memory store is considered stale
if the time since the last reference exceeds the staleness period.
This value is specified in minutes.</description>
<name>yarn.sharedcache.store.in-memory.staleness-period-mins</name>
<value>10080</value>
</property>

<property>
<description>Initial delay before the in-memory store runs its first check
to remove dead initial applications. Specified in minutes.</description>
<name>yarn.sharedcache.store.in-memory.initial-delay-mins</name>
<value>10</value>
</property>

<property>
<description>The frequency at which the in-memory store checks to remove
dead initial applications. Specified in minutes.</description>
<name>yarn.sharedcache.store.in-memory.check-period-mins</name>
<value>720</value>
</property>

<property>
<description>The address of the admin interface in the SCM (shared cache manager)</description>
<name>yarn.sharedcache.admin.address</name>
Expand Down Expand Up @@ -3302,7 +3309,7 @@
Private_Dirty, Private_Clean, Shared_Dirty, Shared_Clean which can be used
for computing more accurate RSS. When this flag is enabled, RSS is computed
as Min(Shared_Dirty, Pss) + Private_Clean + Private_Dirty. It excludes
read-only shared mappings in RSS computation.
read-only shared mappings in RSS computation.
</description>
<name>yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled</name>
<value>false</value>
Expand Down Expand Up @@ -3752,7 +3759,7 @@
<name>yarn.timeline-service.http-cross-origin.enabled</name>
<value>false</value>
</property>

<property>
<description>
Flag to enable cross-origin (CORS) support for timeline service v1.x or
Expand Down Expand Up @@ -3870,7 +3877,7 @@
to specify details about the individual resource types.
</description>
</property>

<property>
<name>yarn.webapp.filter-entity-list-by-user</name>
<value>false</value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ public abstract class AbstractYarnScheduler
protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications;
protected int nmExpireInterval;
protected long nmHeartbeatInterval;
private long skipNodeInterval;

private final static List<Container> EMPTY_CONTAINER_LIST =
new ArrayList<Container>();
Expand Down Expand Up @@ -361,6 +362,10 @@ public long getLastNodeUpdateTime() {
return lastNodeUpdateTime;
}

/**
 * Returns the value held in this scheduler's {@code skipNodeInterval} field.
 *
 * @return the current skip-node interval
 */
public long getSkipNodeInterval() {
  return this.skipNodeInterval;
}

protected void containerLaunchedOnNode(
ContainerId containerId, SchedulerNode node) {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.apache.hadoop.util.Time;

import static org.apache.hadoop.yarn.exceptions
.InvalidResourceRequestException
Expand All @@ -72,7 +73,7 @@
.InvalidResourceRequestException.UNKNOWN_REASON_MESSAGE_TEMPLATE;

/**
* Utilities shared by schedulers.
* Utilities shared by schedulers.
*/
@Private
@Unstable
Expand Down Expand Up @@ -136,7 +137,7 @@ public String toString() {
*
* @param containerId {@link ContainerId} of returned/released/lost container.
* @param diagnostics diagnostic message
* @return <code>ContainerStatus</code> for an returned/released/lost
* @return <code>ContainerStatus</code> for a returned/released/lost
* container
*/
public static ContainerStatus createAbnormalContainerStatus(
Expand Down Expand Up @@ -179,7 +180,7 @@ public static ContainerStatus createPreemptedContainerStatus(
*
* @param containerId {@link ContainerId} of returned/released/lost container.
* @param diagnostics diagnostic message
* @return <code>ContainerStatus</code> for an returned/released/lost
* @return <code>ContainerStatus</code> for a returned/released/lost
* container
*/
private static ContainerStatus createAbnormalContainerStatus(
Expand Down Expand Up @@ -604,4 +605,11 @@ public static RMContainer createOpportunisticRmContainer(RMContext rmContext,
node.allocateContainer(rmContainer);
return rmContainer;
}

/**
 * Tells whether the given node's last heartbeat is recent enough, i.e. the
 * time elapsed since {@code node.getLastHeartbeatMonotonicTime()} does not
 * exceed {@code skipNodeInterval}.
 *
 * @param node the scheduler node whose last heartbeat time is inspected
 * @param skipNodeInterval maximum allowed elapsed time, in the same unit as
 *     {@code Time.monotonicNow()} (presumably milliseconds; confirm)
 * @return {@code true} if the elapsed time is less than or equal to
 *     {@code skipNodeInterval}, {@code false} otherwise
 */
public static boolean isNodeHeartbeated(SchedulerNode node,
    long skipNodeInterval) {
  // Read the clock first, then the node's timestamp, as the original did.
  final long now = Time.monotonicNow();
  final long lastHeartbeat = node.getLastHeartbeatMonotonicTime();
  if (now - lastHeartbeat > skipNodeInterval) {
    return false;
  }
  return true;
}
}
Loading