1919
2020import java .io .IOException ;
2121import java .util .ArrayList ;
22+ import java .util .Collection ;
2223import java .util .Collections ;
24+ import java .util .EnumSet ;
2325import java .util .List ;
2426import java .util .Map ;
2527import java .util .UUID ;
2628import java .util .concurrent .ThreadLocalRandom ;
2729import org .apache .hadoop .conf .Configuration ;
2830import org .apache .hadoop .hbase .Abortable ;
31+ import org .apache .hadoop .hbase .ClusterMetrics ;
2932import org .apache .hadoop .hbase .HBaseConfiguration ;
3033import org .apache .hadoop .hbase .ServerName ;
3134import org .apache .hadoop .hbase .client .AsyncClusterConnection ;
3235import org .apache .hadoop .hbase .client .AsyncRegionServerAdmin ;
3336import org .apache .hadoop .hbase .client .ClusterConnectionFactory ;
3437import org .apache .hadoop .hbase .security .User ;
35- import org .apache .hadoop .hbase .zookeeper .ZKClusterId ;
36- import org .apache .hadoop .hbase .zookeeper .ZKListener ;
37- import org .apache .hadoop .hbase .zookeeper .ZKUtil ;
38- import org .apache .hadoop .hbase .zookeeper .ZKWatcher ;
38+ import org .apache .hadoop .hbase .util .FutureUtils ;
39+ import org .apache .hadoop .hbase .util .ReservoirSample ;
3940import org .apache .yetus .audience .InterfaceAudience ;
40- import org .apache .zookeeper .KeeperException ;
41- import org .apache .zookeeper .KeeperException .AuthFailedException ;
42- import org .apache .zookeeper .KeeperException .ConnectionLossException ;
43- import org .apache .zookeeper .KeeperException .SessionExpiredException ;
4441import org .slf4j .Logger ;
4542import org .slf4j .LoggerFactory ;
4643
@@ -56,12 +53,11 @@ public abstract class HBaseReplicationEndpoint extends BaseReplicationEndpoint
5653
5754 private static final Logger LOG = LoggerFactory .getLogger (HBaseReplicationEndpoint .class );
5855
59- private ZKWatcher zkw = null ;
60- private final Object zkwLock = new Object ();
61-
6256 protected Configuration conf ;
6357
64- private AsyncClusterConnection conn ;
58+ private final Object connLock = new Object ();
59+
60+ private volatile AsyncClusterConnection conn ;
6561
6662 /**
6763 * Default maximum number of times a replication sink can be reported as bad before it will no
@@ -106,36 +102,15 @@ public void init(Context context) throws IOException {
106102 this .badReportCounts = Maps .newHashMap ();
107103 }
108104
109- protected void disconnect () {
110- synchronized (zkwLock ) {
111- if (zkw != null ) {
112- zkw .close ();
113- }
114- }
115- if (this .conn != null ) {
116- try {
117- this .conn .close ();
118- this .conn = null ;
119- } catch (IOException e ) {
120- LOG .warn ("{} Failed to close the connection" , ctx .getPeerId ());
121- }
122- }
123- }
124-
125- /**
126- * A private method used to re-establish a zookeeper session with a peer cluster.
127- */
128- private void reconnect (KeeperException ke ) {
129- if (
130- ke instanceof ConnectionLossException || ke instanceof SessionExpiredException
131- || ke instanceof AuthFailedException
132- ) {
133- String clusterKey = ctx .getPeerConfig ().getClusterKey ();
134- LOG .warn ("Lost the ZooKeeper connection for peer {}" , clusterKey , ke );
135- try {
136- reloadZkWatcher ();
137- } catch (IOException io ) {
138- LOG .warn ("Creation of ZookeeperWatcher failed for peer {}" , clusterKey , io );
105+ private void disconnect () {
106+ synchronized (connLock ) {
107+ if (this .conn != null ) {
108+ try {
109+ this .conn .close ();
110+ this .conn = null ;
111+ } catch (IOException e ) {
112+ LOG .warn ("{} Failed to close the connection" , ctx .getPeerId ());
113+ }
139114 }
140115 }
141116 }
@@ -152,13 +127,7 @@ public void stop() {
152127
153128 @ Override
154129 protected void doStart () {
155- try {
156- reloadZkWatcher ();
157- connectPeerCluster ();
158- notifyStarted ();
159- } catch (IOException e ) {
160- notifyFailed (e );
161- }
130+ notifyStarted ();
162131 }
163132
164133 @ Override
@@ -168,44 +137,40 @@ protected void doStop() {
168137 }
169138
170139 @ Override
171- // Synchronize peer cluster connection attempts to avoid races and rate
172- // limit connections when multiple replication sources try to connect to
173- // the peer cluster. If the peer cluster is down we can get out of control
174- // over time.
175140 public UUID getPeerUUID () {
176- UUID peerUUID = null ;
177141 try {
178- synchronized (zkwLock ) {
179- peerUUID = ZKClusterId .getUUIDForCluster (zkw );
180- }
181- } catch (KeeperException ke ) {
182- reconnect (ke );
142+ AsyncClusterConnection conn = connect ();
143+ String clusterId = FutureUtils
144+ .get (conn .getAdmin ().getClusterMetrics (EnumSet .of (ClusterMetrics .Option .CLUSTER_ID )))
145+ .getClusterId ();
146+ return UUID .fromString (clusterId );
147+ } catch (IOException e ) {
148+ LOG .warn ("Failed to get cluster id for cluster" , e );
149+ return null ;
183150 }
184- return peerUUID ;
185151 }
186152
187- /**
188- * Closes the current ZKW (if not null) and creates a new one
189- * @throws IOException If anything goes wrong connecting
190- */
191- private void reloadZkWatcher () throws IOException {
192- synchronized (zkwLock ) {
193- if (zkw != null ) {
194- zkw .close ();
195- }
196- zkw =
197- new ZKWatcher (ctx .getConfiguration (), "connection to cluster: " + ctx .getPeerId (), this );
198- zkw .registerListener (new PeerRegionServerListener (this ));
153+ // Do not call this method in doStart; only initialize the connection to the remote cluster
154+ // when you actually want to make use of it. The problem here is that starting the replication
155+ // endpoint is part of the region server initialization work, so if the peer cluster is fully
156+ // down and we cannot connect to it, we would cause the initialization to fail and crash the
157+ // region server, as we need the cluster id while setting up the AsyncClusterConnection, which
158+ // needs to at least connect to zookeeper or some other servers in the peer cluster, depending
159+ // on the connection registry implementation.
160+ private AsyncClusterConnection connect () throws IOException {
161+ AsyncClusterConnection c = this .conn ;
162+ if (c != null ) {
163+ return c ;
199164 }
200- }
201-
202- private void connectPeerCluster () throws IOException {
203- try {
204- conn = createConnection (this .conf );
205- } catch (IOException ioe ) {
206- LOG .warn ("{} Failed to create connection for peer cluster" , ctx .getPeerId (), ioe );
207- throw ioe ;
165+ synchronized (connLock ) {
166+ c = this .conn ;
167+ if (c != null ) {
168+ return c ;
169+ }
170+ c = createConnection (this .conf );
171+ conn = c ;
208172 }
173+ return c ;
209174 }
210175
211176 @ Override
@@ -224,36 +189,27 @@ public boolean isAborted() {
224189 * Get the list of all the region servers from the specified peer
225190 * @return list of region server addresses or an empty list if the slave is unavailable
226191 */
227- protected List < ServerName > fetchSlavesAddresses () {
228- List < String > children = null ;
192+ // will be overridden in tests, so protected
193+ protected Collection < ServerName > fetchPeerAddresses () {
229194 try {
230- synchronized (zkwLock ) {
231- children = ZKUtil .listChildrenAndWatchForNewChildren (zkw , zkw .getZNodePaths ().rsZNode );
232- }
233- } catch (KeeperException ke ) {
234- if (LOG .isDebugEnabled ()) {
235- LOG .debug ("Fetch slaves addresses failed" , ke );
236- }
237- reconnect (ke );
238- }
239- if (children == null ) {
195+ return FutureUtils .get (connect ().getAdmin ().getRegionServers (true ));
196+ } catch (IOException e ) {
197+ LOG .debug ("Fetch peer addresses failed" , e );
240198 return Collections .emptyList ();
241199 }
242- List <ServerName > addresses = new ArrayList <>(children .size ());
243- for (String child : children ) {
244- addresses .add (ServerName .parseServerName (child ));
245- }
246- return addresses ;
247200 }
248201
249202 protected synchronized void chooseSinks () {
250- List <ServerName > slaveAddresses = fetchSlavesAddresses ();
203+ Collection <ServerName > slaveAddresses = fetchPeerAddresses ();
251204 if (slaveAddresses .isEmpty ()) {
252205 LOG .warn ("No sinks available at peer. Will not be able to replicate" );
206+ this .sinkServers = Collections .emptyList ();
207+ } else {
208+ int numSinks = (int ) Math .ceil (slaveAddresses .size () * ratio );
209+ ReservoirSample <ServerName > sample = new ReservoirSample <>(numSinks );
210+ sample .add (slaveAddresses .iterator ());
211+ this .sinkServers = sample .getSamplingResult ();
253212 }
254- Collections .shuffle (slaveAddresses , ThreadLocalRandom .current ());
255- int numSinks = (int ) Math .ceil (slaveAddresses .size () * ratio );
256- this .sinkServers = slaveAddresses .subList (0 , numSinks );
257213 badReportCounts .clear ();
258214 }
259215
@@ -275,7 +231,7 @@ protected synchronized SinkPeer getReplicationSink() throws IOException {
275231 }
276232 ServerName serverName =
277233 sinkServers .get (ThreadLocalRandom .current ().nextInt (sinkServers .size ()));
278- return new SinkPeer (serverName , conn .getRegionServerAdmin (serverName ));
234+ return new SinkPeer (serverName , connect () .getRegionServerAdmin (serverName ));
279235 }
280236
281237 /**
@@ -307,29 +263,6 @@ List<ServerName> getSinkServers() {
307263 return sinkServers ;
308264 }
309265
310- /**
311- * Tracks changes to the list of region servers in a peer's cluster.
312- */
313- public static class PeerRegionServerListener extends ZKListener {
314-
315- private final HBaseReplicationEndpoint replicationEndpoint ;
316- private final String regionServerListNode ;
317-
318- public PeerRegionServerListener (HBaseReplicationEndpoint endpoint ) {
319- super (endpoint .zkw );
320- this .replicationEndpoint = endpoint ;
321- this .regionServerListNode = endpoint .zkw .getZNodePaths ().rsZNode ;
322- }
323-
324- @ Override
325- public synchronized void nodeChildrenChanged (String path ) {
326- if (path .equals (regionServerListNode )) {
327- LOG .info ("Detected change to peer region servers, fetching updated list" );
328- replicationEndpoint .chooseSinks ();
329- }
330- }
331- }
332-
333266 /**
334267 * Wraps a replication region server sink to provide the ability to identify it.
335268 */
0 commit comments