Skip to content

Commit 6c8c422

Browse files
bharatviswa504elek
authored andcommitted
HDDS-1178. Healthy pipeline Chill Mode Rule.
Closes #518
1 parent 0e45020 commit 6c8c422

File tree

10 files changed

+315
-47
lines changed

10 files changed

+315
-47
lines changed

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,15 @@ public final class HddsConfigKeys {
8080
public static final String HDDS_SCM_CHILLMODE_THRESHOLD_PCT =
8181
"hdds.scm.chillmode.threshold.pct";
8282
public static final double HDDS_SCM_CHILLMODE_THRESHOLD_PCT_DEFAULT = 0.99;
83+
84+
85+
// percentage of healthy pipelines, where all 3 datanodes are reported in the
86+
// pipeline.
87+
public static final String HDDS_SCM_CHILLMODE_HEALTHY_PIPELINE_THRESHOLD_PCT =
88+
"hdds.scm.chillmode.healthy.pipelie.pct";
89+
public static final double
90+
HDDS_SCM_CHILLMODE_HEALTHY_PIPELINE_THRESHOLD_PCT_DEFAULT = 0.10;
91+
8392
public static final String HDDS_LOCK_MAX_CONCURRENCY =
8493
"hdds.lock.max.concurrency";
8594
public static final int HDDS_LOCK_MAX_CONCURRENCY_DEFAULT = 100;

hadoop-hdds/common/src/main/resources/ozone-default.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,6 +1315,16 @@
13151315
</description>
13161316
</property>
13171317

1318+
<property>
1319+
<name>hdds.scm.chillmode.healthy.pipelie.pct</name>
1320+
<value>0.10</value>
1321+
<tag>HDDS,SCM,OPERATION</tag>
1322+
<description>
1323+
Percentage of healthy pipelines, where all 3 datanodes are reported in the
1324+
pipeline.
1325+
</description>
1326+
</property>
1327+
13181328
<property>
13191329
<name>hdds.container.action.max.limit</name>
13201330
<value>20</value>
Lines changed: 70 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
*/
1818
package org.apache.hadoop.hdds.scm.chillmode;
1919

20-
import java.util.concurrent.atomic.AtomicBoolean;
21-
20+
import org.apache.hadoop.conf.Configuration;
21+
import org.apache.hadoop.hdds.HddsConfigKeys;
22+
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
2223
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.PipelineReport;
2324
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.PipelineReportsProto;
2425
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
@@ -30,33 +31,82 @@
3031
import org.apache.hadoop.hdds.server.events.EventPublisher;
3132

3233
import com.google.common.base.Preconditions;
34+
import org.slf4j.Logger;
35+
import org.slf4j.LoggerFactory;
3336

3437
/**
3538
* Class defining Chill mode exit criteria for Pipelines.
39+
*
40+
* This rule defines percentage of healthy pipelines need to be reported.
41+
* Once chill mode exit happens, this rules take care of writes can go
42+
* through in a cluster.
3643
*/
37-
public class PipelineChillModeRule
44+
public class HealthyPipelineChillModeRule
3845
implements ChillModeExitRule<PipelineReportFromDatanode>,
3946
EventHandler<PipelineReportFromDatanode> {
40-
/** Pipeline availability.*/
41-
private AtomicBoolean isPipelineAvailable = new AtomicBoolean(false);
4247

48+
private static final Logger LOG =
49+
LoggerFactory.getLogger(HealthyPipelineChillModeRule.class);
4350
private final PipelineManager pipelineManager;
4451
private final SCMChillModeManager chillModeManager;
52+
private final int healthyPipelineThresholdCount;
53+
private int currentHealthyPipelineCount = 0;
4554

46-
PipelineChillModeRule(PipelineManager pipelineManager,
47-
SCMChillModeManager manager) {
55+
HealthyPipelineChillModeRule(PipelineManager pipelineManager,
56+
SCMChillModeManager manager, Configuration configuration) {
4857
this.pipelineManager = pipelineManager;
4958
this.chillModeManager = manager;
59+
double healthyPipelinesPercent =
60+
configuration.getDouble(HddsConfigKeys.
61+
HDDS_SCM_CHILLMODE_HEALTHY_PIPELINE_THRESHOLD_PCT,
62+
HddsConfigKeys.
63+
HDDS_SCM_CHILLMODE_HEALTHY_PIPELINE_THRESHOLD_PCT_DEFAULT);
64+
65+
// As we want to wait for 3 node pipelines
66+
int pipelineCount =
67+
pipelineManager.getPipelines(HddsProtos.ReplicationType.RATIS,
68+
HddsProtos.ReplicationFactor.THREE).size();
69+
70+
// This value will be zero when pipeline count is 0.
71+
// On a fresh installed cluster, there will be zero pipelines in the SCM
72+
// pipeline DB.
73+
healthyPipelineThresholdCount =
74+
(int) Math.ceil((healthyPipelinesPercent / 100) * pipelineCount);
75+
76+
LOG.info(" Total pipeline count is {}, healthy pipeline " +
77+
"threshold count is {}", pipelineCount, healthyPipelineThresholdCount);
5078
}
5179

5280
@Override
5381
public boolean validate() {
54-
return isPipelineAvailable.get();
82+
if (currentHealthyPipelineCount >= healthyPipelineThresholdCount) {
83+
return true;
84+
}
85+
return false;
5586
}
5687

5788
@Override
58-
public void process(PipelineReportFromDatanode report) {
59-
// No need to deal with
89+
public void process(PipelineReportFromDatanode pipelineReportFromDatanode) {
90+
Pipeline pipeline;
91+
Preconditions.checkNotNull(pipelineReportFromDatanode);
92+
PipelineReportsProto pipelineReport =
93+
pipelineReportFromDatanode.getReport();
94+
95+
for (PipelineReport report : pipelineReport.getPipelineReportList()) {
96+
PipelineID pipelineID = PipelineID
97+
.getFromProtobuf(report.getPipelineID());
98+
try {
99+
pipeline = pipelineManager.getPipeline(pipelineID);
100+
} catch (PipelineNotFoundException e) {
101+
continue;
102+
}
103+
104+
if (pipeline.getPipelineState() == Pipeline.PipelineState.OPEN) {
105+
// If the pipeline is open state mean, all 3 datanodes are reported
106+
// for this pipeline.
107+
currentHealthyPipelineCount++;
108+
}
109+
}
60110
}
61111

62112
@Override
@@ -67,38 +117,22 @@ public void cleanup() {
67117
@Override
68118
public void onMessage(PipelineReportFromDatanode pipelineReportFromDatanode,
69119
EventPublisher publisher) {
70-
// If we are already in pipeline available state,
71-
// skipping following check.
120+
// If we have already reached healthy pipeline threshold, skip processing
121+
// pipeline report from datanode.
122+
72123
if (validate()) {
73124
chillModeManager.validateChillModeExitRules(publisher);
74125
return;
75126
}
76127

77-
Pipeline pipeline;
78-
Preconditions.checkNotNull(pipelineReportFromDatanode);
79-
PipelineReportsProto pipelineReport = pipelineReportFromDatanode
80-
.getReport();
81-
82-
for (PipelineReport report : pipelineReport.getPipelineReportList()) {
83-
PipelineID pipelineID = PipelineID
84-
.getFromProtobuf(report.getPipelineID());
85-
try {
86-
pipeline = pipelineManager.getPipeline(pipelineID);
87-
} catch (PipelineNotFoundException e) {
88-
continue;
89-
}
128+
// Process pipeline report from datanode
129+
process(pipelineReportFromDatanode);
90130

91-
if (pipeline.getPipelineState() == Pipeline.PipelineState.OPEN) {
92-
// ensure there is an OPEN state pipeline and then allowed
93-
// to exit chill mode
94-
isPipelineAvailable.set(true);
95-
96-
if (chillModeManager.getInChillMode()) {
97-
SCMChillModeManager.getLogger()
98-
.info("SCM in chill mode. 1 Pipeline reported, 1 required.");
99-
}
100-
break;
101-
}
131+
if (chillModeManager.getInChillMode()) {
132+
SCMChillModeManager.getLogger().info(
133+
"SCM in chill mode. Healthy pipelines reported count is {}, " +
134+
"required healthy pipeline reported count is {}",
135+
currentHealthyPipelineCount, healthyPipelineThresholdCount);
102136
}
103137

104138
if (validate()) {

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/chillmode/SCMChillModeManager.java

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ public class SCMChillModeManager implements
6060
private Configuration config;
6161
private static final String CONT_EXIT_RULE = "ContainerChillModeRule";
6262
private static final String DN_EXIT_RULE = "DataNodeChillModeRule";
63-
private static final String PIPELINE_EXIT_RULE = "PipelineChillModeRule";
63+
private static final String HEALTHY_PIPELINE_EXIT_RULE =
64+
"HealthyPipelineChillModeRule";
6465

6566
private final EventQueue eventPublisher;
6667
private final PipelineManager pipelineManager;
@@ -83,10 +84,10 @@ public SCMChillModeManager(Configuration conf,
8384
HddsConfigKeys.HDDS_SCM_CHILLMODE_PIPELINE_AVAILABILITY_CHECK,
8485
HddsConfigKeys.HDDS_SCM_CHILLMODE_PIPELINE_AVAILABILITY_CHECK_DEFAULT)
8586
&& pipelineManager != null) {
86-
PipelineChillModeRule rule = new PipelineChillModeRule(pipelineManager,
87-
this);
88-
exitRules.put(PIPELINE_EXIT_RULE, rule);
89-
eventPublisher.addHandler(SCMEvents.PIPELINE_REPORT, rule);
87+
HealthyPipelineChillModeRule rule = new HealthyPipelineChillModeRule(
88+
pipelineManager, this, config);
89+
exitRules.put(HEALTHY_PIPELINE_EXIT_RULE, rule);
90+
eventPublisher.addHandler(SCMEvents.PROCESSED_PIPELINE_REPORT, rule);
9091
}
9192
emitChillModeStatus();
9293
} else {
@@ -172,4 +173,10 @@ public double getCurrentContainerThreshold() {
172173
.getCurrentContainerThreshold();
173174
}
174175

176+
@VisibleForTesting
177+
public HealthyPipelineChillModeRule getHealthyPipelineChillModeRule() {
178+
return (HealthyPipelineChillModeRule)
179+
exitRules.get(HEALTHY_PIPELINE_EXIT_RULE);
180+
}
181+
175182
}

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,14 @@ public final class SCMEvents {
103103
public static final TypedEvent<PipelineReportFromDatanode> PIPELINE_REPORT =
104104
new TypedEvent<>(PipelineReportFromDatanode.class, "Pipeline_Report");
105105

106+
/**
107+
* PipelineReport processed by pipeline report handler. This event is
108+
* received by HealthyPipelineChillModeRule.
109+
*/
110+
public static final TypedEvent<PipelineReportFromDatanode>
111+
PROCESSED_PIPELINE_REPORT = new TypedEvent<>(
112+
PipelineReportFromDatanode.class, "Processed_Pipeline_Report");
113+
106114
/**
107115
* PipelineActions are sent by Datanode. This event is received by
108116
* SCMDatanodeHeartbeatDispatcher and PIPELINE_ACTIONS event is generated.

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineReportHandler.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,14 @@
2020

2121
import com.google.common.base.Preconditions;
2222
import org.apache.hadoop.conf.Configuration;
23+
import org.apache.hadoop.hdds.HddsConfigKeys;
2324
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
2425
import org.apache.hadoop.hdds.protocol.proto
2526
.StorageContainerDatanodeProtocolProtos.PipelineReport;
2627
import org.apache.hadoop.hdds.protocol.proto
2728
.StorageContainerDatanodeProtocolProtos.PipelineReportsProto;
29+
import org.apache.hadoop.hdds.scm.chillmode.SCMChillModeManager;
30+
import org.apache.hadoop.hdds.scm.events.SCMEvents;
2831
import org.apache.hadoop.hdds.scm.server
2932
.SCMDatanodeHeartbeatDispatcher.PipelineReportFromDatanode;
3033
import org.apache.hadoop.hdds.server.events.EventHandler;
@@ -33,6 +36,7 @@
3336
import org.slf4j.LoggerFactory;
3437

3538
import java.io.IOException;
39+
import java.util.Objects;
3640

3741
/**
3842
* Handles Pipeline Reports from datanode.
@@ -44,12 +48,21 @@ public class PipelineReportHandler implements
4448
.getLogger(PipelineReportHandler.class);
4549
private final PipelineManager pipelineManager;
4650
private final Configuration conf;
51+
private final SCMChillModeManager scmChillModeManager;
52+
private final boolean pipelineAvailabilityCheck;
4753

48-
public PipelineReportHandler(PipelineManager pipelineManager,
54+
public PipelineReportHandler(SCMChillModeManager scmChillModeManager,
55+
PipelineManager pipelineManager,
4956
Configuration conf) {
5057
Preconditions.checkNotNull(pipelineManager);
58+
Objects.requireNonNull(scmChillModeManager);
59+
this.scmChillModeManager = scmChillModeManager;
5160
this.pipelineManager = pipelineManager;
5261
this.conf = conf;
62+
this.pipelineAvailabilityCheck = conf.getBoolean(
63+
HddsConfigKeys.HDDS_SCM_CHILLMODE_PIPELINE_AVAILABILITY_CHECK,
64+
HddsConfigKeys.HDDS_SCM_CHILLMODE_PIPELINE_AVAILABILITY_CHECK_DEFAULT);
65+
5366
}
5467

5568
@Override
@@ -70,6 +83,11 @@ public void onMessage(PipelineReportFromDatanode pipelineReportFromDatanode,
7083
report, dn, e);
7184
}
7285
}
86+
if (pipelineAvailabilityCheck && scmChillModeManager.getInChillMode()) {
87+
publisher.fireEvent(SCMEvents.PROCESSED_PIPELINE_REPORT,
88+
pipelineReportFromDatanode);
89+
}
90+
7391
}
7492

7593
private void processPipelineReport(PipelineReport report, DatanodeDetails dn)

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ public StorageContainerManager(OzoneConfiguration conf,
289289
NodeReportHandler nodeReportHandler =
290290
new NodeReportHandler(scmNodeManager);
291291
PipelineReportHandler pipelineReportHandler =
292-
new PipelineReportHandler(pipelineManager, conf);
292+
new PipelineReportHandler(scmChillModeManager, pipelineManager, conf);
293293
CommandStatusReportHandler cmdStatusReportHandler =
294294
new CommandStatusReportHandler();
295295

0 commit comments

Comments
 (0)