
Commit b05b3a1

prabhjyotsingh authored and harshith-212 committed
ODP-2634: YARN-10352: Skip schedule on not heartbeated nodes in Multi Node Placement. Contributed by Prabhu Joseph and Qi Zhu (#43)
(cherry picked from commit bc815b3)
1 parent 3c0aacb

File tree

7 files changed, +307 -171 lines

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 133 additions & 111 deletions
Large diffs are not rendered by default.
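The YarnConfiguration diff is collapsed here, but given the yarn.scheduler.skip.node.multiplier property added to yarn-default.xml below, it plausibly introduces matching constants plus a helper that turns the multiplier into a skip interval. A minimal sketch under that assumption (names inferred from yarn-default.xml, not quoted from the unrendered diff):

// Hypothetical sketch of additions to YarnConfiguration.java; the rendered
// diff is collapsed above, so these members are assumptions.
public static final String SCHEDULER_SKIP_NODE_MULTIPLIER =
    YARN_PREFIX + "scheduler.skip.node.multiplier";
public static final long DEFAULT_SCHEDULER_SKIP_NODE_MULTIPLIER = 2;

/** Skip interval = NM heartbeat interval (ms) * configured multiplier. */
public static long getSkipNodeInterval(Configuration conf) {
  return conf.getLong(RM_NM_HEARTBEAT_INTERVAL_MS,
      DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS)
      * conf.getLong(SCHEDULER_SKIP_NODE_MULTIPLIER,
          DEFAULT_SCHEDULER_SKIP_NODE_MULTIPLIER);
}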

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 39 additions & 32 deletions
(In the hunks of this commit, removed/added pairs that look identical are trailing-whitespace cleanups.)

@@ -46,15 +46,15 @@
     <name>yarn.ipc.rpc.class</name>
     <value>org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC</value>
   </property>
-
+
   <!-- Resource Manager Configuration -->

   <property>
     <description>The hostname of the RM.</description>
     <name>yarn.resourcemanager.hostname</name>
     <value>0.0.0.0</value>
-  </property>
-
+  </property>
+
   <property>
     <description>The address of the applications manager interface in the RM.</description>
     <name>yarn.resourcemanager.address</name>
@@ -919,6 +919,13 @@
     <value>1.0</value>
   </property>

+  <property>
+    <description>The Number of consecutive missed heartbeats after which node will be
+      skipped from scheduling</description>
+    <name>yarn.scheduler.skip.node.multiplier</name>
+    <value>2</value>
+  </property>
+
   <property>
     <description>The minimum allowed version of a connecting nodemanager. The valid values are
       NONE (no version checking), EqualToRM (the nodemanager's version is equal to
@@ -1140,7 +1147,7 @@
     <name>yarn.nodemanager.hostname</name>
     <value>0.0.0.0</value>
   </property>
-
+
   <property>
     <description>The address of the container manager in the NM.</description>
     <name>yarn.nodemanager.address</name>
@@ -1229,13 +1236,13 @@

   <property>
     <description>
-      Number of seconds after an application finishes before the nodemanager's
+      Number of seconds after an application finishes before the nodemanager's
       DeletionService will delete the application's localized file directory
       and log directory.
-
+
       To diagnose YARN application problems, set this property's value large
       enough (for example, to 600 = 10 minutes) to permit examination of these
-      directories. After changing the property's value, you must restart the
+      directories. After changing the property's value, you must restart the
       nodemanager in order for it to have an effect.

       The roots of YARN applications' work directories is configurable with
@@ -1254,7 +1261,7 @@
   </property>

   <property>
-    <description>List of directories to store localized files in. An
+    <description>List of directories to store localized files in. An
       application's localized file directory will be found in:
       ${yarn.nodemanager.local-dirs}/usercache/${user}/appcache/application_${appid}.
       Individual containers' work directories, called container_${contid}, will
@@ -1312,7 +1319,7 @@

   <property>
     <description>Target size of localizer cache in MB, per nodemanager. It is
-      a target retention size that only includes resources with PUBLIC and
+      a target retention size that only includes resources with PUBLIC and
       PRIVATE visibility and excludes resources with APPLICATION visibility
     </description>
     <name>yarn.nodemanager.localizer.cache.target-size-mb</name>
@@ -1350,7 +1357,7 @@
     <description>
       Where to store container logs. An application's localized log directory
       will be found in ${yarn.nodemanager.log-dirs}/application_${appid}.
-      Individual containers' log directories will be below this, in directories
+      Individual containers' log directories will be below this, in directories
       named container_{$contid}. Each container directory will contain the files
       stderr, stdin, and syslog generated by that container.
     </description>
@@ -1382,12 +1389,12 @@
   </property>

   <property>
-    <description>How long to keep aggregation logs before deleting them. -1 disables.
+    <description>How long to keep aggregation logs before deleting them. -1 disables.
       Be careful set this too small and you will spam the name node.</description>
     <name>yarn.log-aggregation.retain-seconds</name>
     <value>-1</value>
-  </property>
-
+  </property>
+
   <property>
     <description>How long to wait between aggregated log retention checks.
       If set to 0 or a negative value then the value is computed as one-tenth
@@ -1451,7 +1458,7 @@
     <value>/tmp/logs</value>
   </property>
   <property>
-    <description>The remote log dir will be created at
+    <description>The remote log dir will be created at
       {yarn.nodemanager.remote-app-log-dir}/${user}/{thisParam}
     </description>
     <name>yarn.nodemanager.remote-app-log-dir-suffix</name>
@@ -1471,7 +1478,7 @@
   </property>

   <property>
-    <description>Amount of physical memory, in MB, that can be allocated
+    <description>Amount of physical memory, in MB, that can be allocated
       for containers. If set to -1 and
       yarn.nodemanager.resource.detect-hardware-capabilities is true, it is
       automatically calculated(in case of Windows and Linux).
@@ -1762,9 +1769,9 @@
   </property>

   <property>
-    <description>The maximum percentage of disk space utilization allowed after
-      which a disk is marked as bad. Values can range from 0.0 to 100.0.
-      If the value is greater than or equal to 100, the nodemanager will check
+    <description>The maximum percentage of disk space utilization allowed after
+      which a disk is marked as bad. Values can range from 0.0 to 100.0.
+      If the value is greater than or equal to 100, the nodemanager will check
       for full disk. This applies to yarn.nodemanager.local-dirs and
       yarn.nodemanager.log-dirs when
       yarn.nodemanager.disk-health-checker.disk-utilization-threshold.enabled is true.</description>
@@ -2120,8 +2127,8 @@
   </property>

   <property>
-    <description>The minimum allowed version of a resourcemanager that a nodemanager will connect to.
-      The valid values are NONE (no version checking), EqualToNM (the resourcemanager's version is
+    <description>The minimum allowed version of a resourcemanager that a nodemanager will connect to.
+      The valid values are NONE (no version checking), EqualToNM (the resourcemanager's version is
       equal to or greater than the NM version), or a Version String.</description>
     <name>yarn.nodemanager.resourcemanager.minimum.version</name>
     <value>NONE</value>
@@ -2202,7 +2209,7 @@
     <name>yarn.client.max-cached-nodemanagers-proxies</name>
     <value>0</value>
   </property>
-
+
   <property>
     <description>Enable the node manager to recover after starting</description>
     <name>yarn.nodemanager.recovery.enabled</name>
@@ -2314,13 +2321,13 @@
     <name>yarn.web-proxy.principal</name>
     <value/>
   </property>
-
+
   <property>
-    <description>Keytab for WebAppProxy, if the proxy is not running as part of
+    <description>Keytab for WebAppProxy, if the proxy is not running as part of
       the RM.</description>
     <name>yarn.web-proxy.keytab</name>
   </property>
-
+
   <property>
     <description>The address for the web proxy as HOST:PORT, if this is not
       given then the proxy will run as part of the RM</description>
@@ -2334,7 +2341,7 @@
     <description>
       CLASSPATH for YARN applications. A comma-separated list
       of CLASSPATH entries. When this value is empty, the following default
-      CLASSPATH for YARN applications would be used.
+      CLASSPATH for YARN applications would be used.
       For Linux:
       $HADOOP_CONF_DIR,
       $HADOOP_COMMON_HOME/share/hadoop/common/*,
@@ -2849,29 +2856,29 @@
     <name>yarn.sharedcache.app-checker.class</name>
     <value>org.apache.hadoop.yarn.server.sharedcachemanager.RemoteAppChecker</value>
   </property>
-
+
   <property>
     <description>A resource in the in-memory store is considered stale
       if the time since the last reference exceeds the staleness period.
       This value is specified in minutes.</description>
     <name>yarn.sharedcache.store.in-memory.staleness-period-mins</name>
     <value>10080</value>
   </property>
-
+
   <property>
     <description>Initial delay before the in-memory store runs its first check
       to remove dead initial applications. Specified in minutes.</description>
     <name>yarn.sharedcache.store.in-memory.initial-delay-mins</name>
     <value>10</value>
   </property>
-
+
   <property>
     <description>The frequency at which the in-memory store checks to remove
       dead initial applications. Specified in minutes.</description>
     <name>yarn.sharedcache.store.in-memory.check-period-mins</name>
     <value>720</value>
   </property>
-
+
   <property>
     <description>The address of the admin interface in the SCM (shared cache manager)</description>
     <name>yarn.sharedcache.admin.address</name>
@@ -3302,7 +3309,7 @@
       Private_Dirty, Private_Clean, Shared_Dirty, Shared_Clean which can be used
       for computing more accurate RSS. When this flag is enabled, RSS is computed
      as Min(Shared_Dirty, Pss) + Private_Clean + Private_Dirty. It excludes
-      read-only shared mappings in RSS computation.
+      read-only shared mappings in RSS computation.
     </description>
     <name>yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled</name>
     <value>false</value>
@@ -3752,7 +3759,7 @@
     <name>yarn.timeline-service.http-cross-origin.enabled</name>
     <value>false</value>
   </property>
-
+
   <property>
     <description>
       Flag to enable cross-origin (CORS) support for timeline service v1.x or
@@ -3870,7 +3877,7 @@
       to specify details about the individual resource types.
     </description>
   </property>
-
+
   <property>
     <name>yarn.webapp.filter-entity-list-by-user</name>
     <value>false</value>
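Operationally, the new property scales the RM's NM heartbeat interval: with the defaults, a node whose last heartbeat is older than two heartbeat periods stops receiving placements. A short sketch of the arithmetic, assuming the skip interval is heartbeat interval times multiplier (the heartbeat constants below exist in YarnConfiguration; the multiplier key is read literally):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

// Defaults: 1000 ms heartbeat * multiplier 2 => skip nodes silent > 2000 ms.
Configuration conf = new YarnConfiguration();
long heartbeatMs = conf.getLong(
    YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS,
    YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS);
long multiplier = conf.getLong("yarn.scheduler.skip.node.multiplier", 2);
long skipNodeIntervalMs = heartbeatMs * multiplier;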

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java

Lines changed: 5 additions & 0 deletions
@@ -159,6 +159,7 @@ public abstract class AbstractYarnScheduler
   protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications;
   protected int nmExpireInterval;
   protected long nmHeartbeatInterval;
+  private long skipNodeInterval;

   private final static List<Container> EMPTY_CONTAINER_LIST =
       new ArrayList<Container>();
@@ -361,6 +362,10 @@ public long getLastNodeUpdateTime() {
     return lastNodeUpdateTime;
   }

+  public long getSkipNodeInterval(){
+    return skipNodeInterval;
+  }
+
   protected void containerLaunchedOnNode(
       ContainerId containerId, SchedulerNode node) {
     try {
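The diff adds the field and its getter, but the initialization site is outside the rendered hunks. A sketch of how skipNodeInterval is presumably populated during scheduler init (the body is an assumption, not quoted from the commit):

// Assumed wiring inside AbstractYarnScheduler#serviceInit:
// skip interval = NM heartbeat interval * yarn.scheduler.skip.node.multiplier.
@Override
public void serviceInit(Configuration conf) throws Exception {
  nmHeartbeatInterval = conf.getLong(
      YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS,
      YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS);
  skipNodeInterval = nmHeartbeatInterval
      * conf.getLong("yarn.scheduler.skip.node.multiplier", 2);
  super.serviceInit(conf);
}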

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerUtils.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
6262
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
6363
import org.apache.hadoop.yarn.util.resource.Resources;
64+
import org.apache.hadoop.util.Time;
6465

6566
import static org.apache.hadoop.yarn.exceptions
6667
.InvalidResourceRequestException
@@ -72,7 +73,7 @@
7273
.InvalidResourceRequestException.UNKNOWN_REASON_MESSAGE_TEMPLATE;
7374

7475
/**
75-
* Utilities shared by schedulers.
76+
* Utilities shared by schedulers.
7677
*/
7778
@Private
7879
@Unstable
@@ -136,7 +137,7 @@ public String toString() {
136137
*
137138
* @param containerId {@link ContainerId} of returned/released/lost container.
138139
* @param diagnostics diagnostic message
139-
* @return <code>ContainerStatus</code> for an returned/released/lost
140+
* @return <code>ContainerStatus</code> for an returned/released/lost
140141
* container
141142
*/
142143
public static ContainerStatus createAbnormalContainerStatus(
@@ -179,7 +180,7 @@ public static ContainerStatus createPreemptedContainerStatus(
179180
*
180181
* @param containerId {@link ContainerId} of returned/released/lost container.
181182
* @param diagnostics diagnostic message
182-
* @return <code>ContainerStatus</code> for an returned/released/lost
183+
* @return <code>ContainerStatus</code> for an returned/released/lost
183184
* container
184185
*/
185186
private static ContainerStatus createAbnormalContainerStatus(
@@ -604,4 +605,11 @@ public static RMContainer createOpportunisticRmContainer(RMContext rmContext,
604605
node.allocateContainer(rmContainer);
605606
return rmContainer;
606607
}
608+
609+
public static boolean isNodeHeartbeated(SchedulerNode node,
610+
long skipNodeInterval) {
611+
long timeElapsedFromLastHeartbeat =
612+
Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
613+
return timeElapsedFromLastHeartbeat <= skipNodeInterval;
614+
}
607615
}
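isNodeHeartbeated compares monotonic clock readings, so wall-clock jumps cannot mark a healthy node stale. A sketch of how a multi-node placement loop could combine it with the getter added in AbstractYarnScheduler (candidateNodes and the allocation step are placeholders):

// Placeholder loop: skip candidates whose last heartbeat is too old.
long skipNodeInterval = scheduler.getSkipNodeInterval();
for (SchedulerNode node : candidateNodes) {
  if (!SchedulerUtils.isNodeHeartbeated(node, skipNodeInterval)) {
    continue; // heartbeat older than the configured multiple; don't place here
  }
  // ... attempt container allocation on this node ...
}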
