-
Couldn't load subscription status.
- Fork 9.1k
HDDS-1406. Avoid usage of commonPool in RatisPipelineUtils. #714
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
25ef382
0a72fc6
457190b
e03e489
3eaca7e
9c0fa8a
cd751ad
2ddb4cf
8505fcf
b2100fa
e5bb220
c3d6890
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,35 +24,76 @@ | |
| import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor; | ||
| import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState; | ||
| import org.apache.hadoop.hdds.scm.ScmConfigKeys; | ||
| import org.apache.hadoop.hdds.scm.client.HddsClientUtils; | ||
| import org.apache.hadoop.hdds.scm.container.placement.algorithms.ContainerPlacementPolicy; | ||
| import org.apache.hadoop.hdds.scm.container.placement.algorithms.SCMContainerPlacementRandom; | ||
| import org.apache.hadoop.hdds.scm.node.NodeManager; | ||
| import org.apache.hadoop.hdds.scm.pipeline.Pipeline.PipelineState; | ||
| import org.apache.hadoop.hdds.security.x509.SecurityConfig; | ||
| import org.apache.hadoop.io.MultipleIOException; | ||
| import org.apache.ratis.RatisHelper; | ||
| import org.apache.ratis.client.RaftClient; | ||
| import org.apache.ratis.grpc.GrpcTlsConfig; | ||
| import org.apache.ratis.protocol.RaftClientReply; | ||
| import org.apache.ratis.protocol.RaftGroup; | ||
| import org.apache.ratis.protocol.RaftPeer; | ||
| import org.apache.ratis.retry.RetryPolicy; | ||
| import org.apache.ratis.rpc.SupportedRpcType; | ||
| import org.apache.ratis.util.TimeDuration; | ||
| import org.apache.ratis.util.function.CheckedBiConsumer; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import java.io.IOException; | ||
| import java.lang.reflect.Constructor; | ||
| import java.lang.reflect.InvocationTargetException; | ||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Set; | ||
| import java.util.concurrent.ExecutionException; | ||
| import java.util.concurrent.ForkJoinPool; | ||
| import java.util.concurrent.ForkJoinWorkerThread; | ||
| import java.util.concurrent.RejectedExecutionException; | ||
| import java.util.concurrent.TimeUnit; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| /** | ||
| * Implements the API for creating Ratis pipelines. | ||
| */ | ||
| public class RatisPipelineProvider implements PipelineProvider { | ||
|
|
||
  private static final Logger LOG =
      LoggerFactory.getLogger(RatisPipelineProvider.class);

  // Collaborators supplied through the constructor.
  private final NodeManager nodeManager;
  private final PipelineStateManager stateManager;
  private final Configuration conf;

  // Set parallelism at 3, as now in Ratis we create 1 and 3 node pipelines.
  private final int parallelismForPool = 3;

  // Thread factory that names each worker so pipeline-creation threads are
  // identifiable in thread dumps (RATISCREATEPIPELINE<poolIndex>).
  private final ForkJoinPool.ForkJoinWorkerThreadFactory factory =
      (pool -> {
        final ForkJoinWorkerThread worker = ForkJoinPool.
            defaultForkJoinWorkerThreadFactory.newThread(pool);
        worker.setName("RATISCREATEPIPELINE" + worker.getPoolIndex());
        return worker;
      });

  // Dedicated pool so pipeline-creation RPCs avoid the JVM-wide
  // ForkJoinPool.commonPool() (see HDDS-1406). Constructor args:
  // null = no UncaughtExceptionHandler, false = FIFO async mode disabled.
  private final ForkJoinPool forkJoinPool = new ForkJoinPool(
      parallelismForPool, factory, null, false);
|
|
||
|
|
||
  /**
   * Creates a RatisPipelineProvider.
   *
   * @param nodeManager  node manager used by this provider
   * @param stateManager pipeline state manager used by this provider
   * @param conf         configuration to read Ratis/SCM settings from
   */
  RatisPipelineProvider(NodeManager nodeManager,
      PipelineStateManager stateManager, Configuration conf) {
    this.nodeManager = nodeManager;
    this.stateManager = stateManager;
    this.conf = conf;
  }
|
|
||
|
|
||
| /** | ||
| * Create pluggable container placement policy implementation instance. | ||
| * | ||
|
|
@@ -133,7 +174,81 @@ public Pipeline create(ReplicationFactor factor, | |
| .build(); | ||
| } | ||
|
|
||
|
|
||
| @Override | ||
| public void shutdown() { | ||
| forkJoinPool.shutdownNow(); | ||
| try { | ||
| forkJoinPool.awaitTermination(60, TimeUnit.SECONDS); | ||
| } catch (Exception e) { | ||
| LOG.error("Unexpected exception occurred during shutdown of " + | ||
| "RatisPipelineProvider", e); | ||
| } | ||
| } | ||
|
|
||
  /**
   * Initializes the newly created pipeline by issuing a Ratis groupAdd RPC
   * to every datanode in the pipeline.
   *
   * @param pipeline pipeline to initialize
   * @throws IOException if any node rejects or fails the group add
   */
  protected void initializePipeline(Pipeline pipeline) throws IOException {
    // NOTE(review): this delegation plus the explicit groupAdd below look
    // like the pre- and post-refactor creation paths from the same diff;
    // confirm against RatisPipelineUtils that the work is not done twice.
    RatisPipelineUtils.createPipeline(pipeline, conf);
    final RaftGroup group = RatisHelper.newRaftGroup(pipeline);
    LOG.debug("creating pipeline:{} with {}", pipeline.getId(), group);
    callRatisRpc(pipeline.getNodes(),
        (raftClient, peer) -> {
          RaftClientReply reply = raftClient.groupAdd(group, peer.getId());
          // A null or unsuccessful reply means this node did not join the
          // Raft group; fail pipeline initialization.
          if (reply == null || !reply.isSuccess()) {
            String msg = "Pipeline initialization failed for pipeline:"
                + pipeline.getId() + " node:" + peer.getId();
            LOG.error(msg);
            throw new IOException(msg);
          }
        });
  }
|
|
||
  /**
   * Invokes the given RPC against every datanode in parallel on this
   * provider's dedicated ForkJoinPool (not ForkJoinPool.commonPool()).
   * Per-node IOExceptions are collected so that every node is attempted,
   * then rethrown together as a MultipleIOException.
   *
   * @param datanodes nodes to contact; no-op when empty
   * @param rpc       operation to run against each (client, peer) pair
   * @throws IOException if submission fails, the wait is interrupted, or
   *                     any per-node RPC failed
   */
  private void callRatisRpc(List<DatanodeDetails> datanodes,
      CheckedBiConsumer<RaftClient, RaftPeer, IOException> rpc)
      throws IOException {
    if (datanodes.isEmpty()) {
      return;
    }

    final String rpcType = conf
        .get(ScmConfigKeys.DFS_CONTAINER_RATIS_RPC_TYPE_KEY,
            ScmConfigKeys.DFS_CONTAINER_RATIS_RPC_TYPE_DEFAULT);
    final RetryPolicy retryPolicy = RatisHelper.createRetryPolicy(conf);
    // Synchronized sink: parallel-stream workers append failures here.
    final List<IOException> exceptions =
        Collections.synchronizedList(new ArrayList<>());
    final int maxOutstandingRequests =
        HddsClientUtils.getMaxOutstandingRequests(conf);
    final GrpcTlsConfig tlsConfig = RatisHelper.createTlsClientConfig(new
        SecurityConfig(conf));
    final TimeDuration requestTimeout =
        RatisHelper.getClientRequestTimeout(conf);
    try {
      // Submitting the parallelStream task from inside forkJoinPool makes
      // the stream execute on this dedicated pool instead of the JVM-wide
      // commonPool (the point of HDDS-1406).
      forkJoinPool.submit(() -> {
        datanodes.parallelStream().forEach(d -> {
          final RaftPeer p = RatisHelper.toRaftPeer(d);
          // try-with-resources closes the per-node RaftClient.
          try (RaftClient client = RatisHelper
              .newRaftClient(SupportedRpcType.valueOfIgnoreCase(rpcType), p,
                  retryPolicy, maxOutstandingRequests, tlsConfig,
                  requestTimeout)) {
            rpc.accept(client, p);
          } catch (IOException ioe) {
            // Collect rather than fail fast so all nodes are attempted.
            String errMsg =
                "Failed invoke Ratis rpc " + rpc + " for " + d.getUuid();
            LOG.error(errMsg, ioe);
            exceptions.add(new IOException(errMsg, ioe));
          }
        });
      }).get(); // Block until every per-node task has completed.
    } catch (ExecutionException | RejectedExecutionException ex) {
      LOG.error(ex.getClass().getName() + " exception occurred during " +
          "createPipeline", ex);
      throw new IOException(ex.getClass().getName() + " exception occurred " +
          "during createPipeline", ex);
    } catch (InterruptedException ex) {
      // Restore interrupt status before converting to IOException.
      Thread.currentThread().interrupt();
      throw new IOException("Interrupt exception occurred during " +
          "createPipeline", ex);
    }
    if (!exceptions.isEmpty()) {
      throw MultipleIOException.createIOException(exceptions);
    }
  }
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also wait for the tasks to finish. We need to use awaitTermination call. We can use timeout of 60 seconds? That is what is used in Scheduler class.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is done based on arpit's comment, as on an unclean shutdown this terminate abruptly. So we can use shutdownNow(), instead of awaitTermination in normal case too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@bharatviswa504 I agree. We need to use shutdownNow but we also need to use awaitTermination. shutdownNow would interrupt the running tasks but the running task should handle the interrupt. If the task does not exit on interrupt, it is a better idea to wait for the task to finish.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done