Skip to content

Commit cc4fd8b

Browse files
authored
HBASE-27579 CatalogJanitor can cause data loss due to errors during cleanMergeRegion (#4986)
Signed-off-by: Andrew Purtell <[email protected]> Signed-off-by: Viraj Jasani <[email protected]> Signed-off-by: Duo Zhang <[email protected]>
1 parent a0fdfa7 commit cc4fd8b

File tree

2 files changed

+145
-56
lines changed

2 files changed

+145
-56
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/janitor/CatalogJanitor.java

Lines changed: 45 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
import org.apache.hadoop.hbase.MetaTableAccessor;
3636
import org.apache.hadoop.hbase.ScheduledChore;
3737
import org.apache.hadoop.hbase.TableName;
38-
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
3938
import org.apache.hadoop.hbase.client.Connection;
4039
import org.apache.hadoop.hbase.client.ConnectionFactory;
4140
import org.apache.hadoop.hbase.client.Get;
@@ -184,12 +183,12 @@ public int scan() throws IOException {
184183
for (Map.Entry<RegionInfo, Result> e : mergedRegions.entrySet()) {
185184
if (this.services.isInMaintenanceMode()) {
186185
// Stop cleaning if the master is in maintenance mode
187-
LOG.debug("In maintenence mode, not cleaning");
186+
LOG.debug("In maintenance mode, not cleaning");
188187
break;
189188
}
190189

191190
List<RegionInfo> parents = CatalogFamilyFormat.getMergeRegions(e.getValue().rawCells());
192-
if (parents != null && cleanMergeRegion(e.getKey(), parents)) {
191+
if (parents != null && cleanMergeRegion(this.services, e.getKey(), parents)) {
193192
gcs++;
194193
}
195194
}
@@ -202,7 +201,7 @@ public int scan() throws IOException {
202201
if (this.services.isInMaintenanceMode()) {
203202
// Stop cleaning if the master is in maintenance mode
204203
if (LOG.isDebugEnabled()) {
205-
LOG.debug("In maintenence mode, not cleaning");
204+
LOG.debug("In maintenance mode, not cleaning");
206205
}
207206
break;
208207
}
@@ -249,30 +248,24 @@ public CatalogJanitorReport getLastReport() {
249248
* @return true if we delete references in merged region on hbase:meta and archive the files on
250249
* the file system
251250
*/
252-
private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo> parents)
253-
throws IOException {
251+
static boolean cleanMergeRegion(MasterServices services, final RegionInfo mergedRegion,
252+
List<RegionInfo> parents) throws IOException {
254253
if (LOG.isDebugEnabled()) {
255254
LOG.debug("Cleaning merged region {}", mergedRegion);
256255
}
257-
FileSystem fs = this.services.getMasterFileSystem().getFileSystem();
258-
Path rootdir = this.services.getMasterFileSystem().getRootDir();
259-
Path tabledir = CommonFSUtils.getTableDir(rootdir, mergedRegion.getTable());
260-
TableDescriptor htd = getDescriptor(mergedRegion.getTable());
261-
HRegionFileSystem regionFs = null;
262-
try {
263-
regionFs = HRegionFileSystem.openRegionFromFileSystem(this.services.getConfiguration(), fs,
264-
tabledir, mergedRegion, true);
265-
} catch (IOException e) {
266-
LOG.warn("Merged region does not exist: " + mergedRegion.getEncodedName());
267-
}
268-
if (regionFs == null || !regionFs.hasReferences(htd)) {
256+
257+
Pair<Boolean, Boolean> result =
258+
checkRegionReferences(services, mergedRegion.getTable(), mergedRegion);
259+
260+
if (hasNoReferences(result)) {
269261
if (LOG.isDebugEnabled()) {
270262
LOG.debug(
271263
"Deleting parents ({}) from fs; merged child {} no longer holds references", parents
272264
.stream().map(r -> RegionInfo.getShortNameToLog(r)).collect(Collectors.joining(", ")),
273265
mergedRegion);
274266
}
275-
ProcedureExecutor<MasterProcedureEnv> pe = this.services.getMasterProcedureExecutor();
267+
268+
ProcedureExecutor<MasterProcedureEnv> pe = services.getMasterProcedureExecutor();
276269
GCMultipleMergedRegionsProcedure mergeRegionProcedure =
277270
new GCMultipleMergedRegionsProcedure(pe.getEnvironment(), mergedRegion, parents);
278271
pe.submitProcedure(mergeRegionProcedure);
@@ -281,7 +274,15 @@ private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo>
281274
mergedRegion);
282275
}
283276
return true;
277+
} else {
278+
if (LOG.isDebugEnabled()) {
279+
LOG.debug(
280+
"Deferring cleanup up of {} parents of merged region {}, because references "
281+
+ "still exist in merged region or we encountered an exception in checking",
282+
parents.size(), mergedRegion.getEncodedName());
283+
}
284284
}
285+
285286
return false;
286287
}
287288

@@ -333,8 +334,10 @@ static boolean cleanParent(MasterServices services, RegionInfo parent, Result ro
333334
}
334335
// Run checks on each daughter split.
335336
PairOfSameType<RegionInfo> daughters = MetaTableAccessor.getDaughterRegions(rowContent);
336-
Pair<Boolean, Boolean> a = checkDaughterInFs(services, parent, daughters.getFirst());
337-
Pair<Boolean, Boolean> b = checkDaughterInFs(services, parent, daughters.getSecond());
337+
Pair<Boolean, Boolean> a =
338+
checkRegionReferences(services, parent.getTable(), daughters.getFirst());
339+
Pair<Boolean, Boolean> b =
340+
checkRegionReferences(services, parent.getTable(), daughters.getSecond());
338341
if (hasNoReferences(a) && hasNoReferences(b)) {
339342
String daughterA =
340343
daughters.getFirst() != null ? daughters.getFirst().getShortNameToLog() : "null";
@@ -387,59 +390,45 @@ private static boolean hasNoReferences(final Pair<Boolean, Boolean> p) {
387390
}
388391

389392
/**
390-
* Checks if a daughter region -- either splitA or splitB -- still holds references to parent.
391-
* @param parent Parent region
392-
* @param daughter Daughter region
393-
* @return A pair where the first boolean says whether or not the daughter region directory exists
394-
* in the filesystem and then the second boolean says whether the daughter has references
395-
* to the parent.
393+
* Checks if a region still holds references to parent.
394+
* @param tableName The table for the region
395+
* @param region The region to check
396+
* @return A pair where the first boolean says whether the region directory exists in the
397+
* filesystem and then the second boolean says whether the region has references to a
398+
* parent.
396399
*/
397-
private static Pair<Boolean, Boolean> checkDaughterInFs(MasterServices services,
398-
final RegionInfo parent, final RegionInfo daughter) throws IOException {
399-
if (daughter == null) {
400+
private static Pair<Boolean, Boolean> checkRegionReferences(MasterServices services,
401+
TableName tableName, RegionInfo region) throws IOException {
402+
if (region == null) {
400403
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
401404
}
402405

403406
FileSystem fs = services.getMasterFileSystem().getFileSystem();
404407
Path rootdir = services.getMasterFileSystem().getRootDir();
405-
Path tabledir = CommonFSUtils.getTableDir(rootdir, daughter.getTable());
406-
407-
Path daughterRegionDir = new Path(tabledir, daughter.getEncodedName());
408-
409-
HRegionFileSystem regionFs;
408+
Path tabledir = CommonFSUtils.getTableDir(rootdir, tableName);
409+
Path regionDir = new Path(tabledir, region.getEncodedName());
410410

411411
try {
412-
if (!CommonFSUtils.isExists(fs, daughterRegionDir)) {
412+
if (!CommonFSUtils.isExists(fs, regionDir)) {
413413
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
414414
}
415415
} catch (IOException ioe) {
416-
LOG.error("Error trying to determine if daughter region exists, "
417-
+ "assuming exists and has references", ioe);
416+
LOG.error("Error trying to determine if region exists, assuming exists and has references",
417+
ioe);
418418
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
419419
}
420420

421-
boolean references = false;
422-
TableDescriptor parentDescriptor = services.getTableDescriptors().get(parent.getTable());
421+
TableDescriptor tableDescriptor = services.getTableDescriptors().get(tableName);
423422
try {
424-
regionFs = HRegionFileSystem.openRegionFromFileSystem(services.getConfiguration(), fs,
425-
tabledir, daughter, true);
426-
427-
for (ColumnFamilyDescriptor family : parentDescriptor.getColumnFamilies()) {
428-
references = regionFs.hasReferences(family.getNameAsString());
429-
if (references) {
430-
break;
431-
}
432-
}
423+
HRegionFileSystem regionFs = HRegionFileSystem
424+
.openRegionFromFileSystem(services.getConfiguration(), fs, tabledir, region, true);
425+
boolean references = regionFs.hasReferences(tableDescriptor);
426+
return new Pair<>(Boolean.TRUE, references);
433427
} catch (IOException e) {
434-
LOG.error("Error trying to determine referenced files from : " + daughter.getEncodedName()
435-
+ ", to: " + parent.getEncodedName() + " assuming has references", e);
428+
LOG.error("Error trying to determine if region {} has references, assuming it does",
429+
region.getEncodedName(), e);
436430
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
437431
}
438-
return new Pair<>(Boolean.TRUE, references);
439-
}
440-
441-
private TableDescriptor getDescriptor(final TableName tableName) throws IOException {
442-
return this.services.getTableDescriptors().get(tableName);
443432
}
444433

445434
private void updateAssignmentManagerMetrics() {

hbase-server/src/test/java/org/apache/hadoop/hbase/master/janitor/TestCatalogJanitor.java

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,12 @@
2121
import static org.junit.Assert.assertEquals;
2222
import static org.junit.Assert.assertFalse;
2323
import static org.junit.Assert.assertTrue;
24+
import static org.mockito.Mockito.any;
25+
import static org.mockito.Mockito.doAnswer;
2426
import static org.mockito.Mockito.doReturn;
27+
import static org.mockito.Mockito.doThrow;
2528
import static org.mockito.Mockito.spy;
29+
import static org.mockito.Mockito.when;
2630

2731
import java.io.IOException;
2832
import java.util.ArrayList;
@@ -34,6 +38,7 @@
3438
import java.util.SortedSet;
3539
import java.util.TreeMap;
3640
import java.util.concurrent.ConcurrentSkipListMap;
41+
import java.util.concurrent.atomic.AtomicBoolean;
3742
import org.apache.hadoop.fs.FSDataOutputStream;
3843
import org.apache.hadoop.fs.FileStatus;
3944
import org.apache.hadoop.fs.FileSystem;
@@ -52,6 +57,7 @@
5257
import org.apache.hadoop.hbase.client.TableDescriptor;
5358
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
5459
import org.apache.hadoop.hbase.io.Reference;
60+
import org.apache.hadoop.hbase.master.MasterFileSystem;
5561
import org.apache.hadoop.hbase.master.MasterServices;
5662
import org.apache.hadoop.hbase.master.assignment.MockMasterServices;
5763
import org.apache.hadoop.hbase.master.janitor.CatalogJanitor.SplitParentFirstComparator;
@@ -126,6 +132,100 @@ private RegionInfo createRegionInfo(TableName tableName, byte[] startKey, byte[]
126132
.setSplit(split).build();
127133
}
128134

135+
@Test
136+
public void testCleanMerge() throws IOException {
137+
TableDescriptor td = createTableDescriptorForCurrentMethod();
138+
// Create regions.
139+
RegionInfo merged =
140+
createRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
141+
RegionInfo parenta =
142+
createRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
143+
RegionInfo parentb =
144+
createRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));
145+
146+
List<RegionInfo> parents = new ArrayList<>();
147+
parents.add(parenta);
148+
parents.add(parentb);
149+
150+
Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
151+
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
152+
Path storedir =
153+
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
154+
155+
Path parentaRef = createMergeReferenceFile(storedir, merged, parenta);
156+
Path parentbRef = createMergeReferenceFile(storedir, merged, parentb);
157+
158+
// references exist, should not clean
159+
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
160+
161+
masterServices.getMasterFileSystem().getFileSystem().delete(parentaRef, false);
162+
163+
// one reference still exists, should not clean
164+
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
165+
166+
masterServices.getMasterFileSystem().getFileSystem().delete(parentbRef, false);
167+
168+
// all references removed, should clean
169+
assertTrue(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
170+
}
171+
172+
@Test
173+
public void testDontCleanMergeIfFileSystemException() throws IOException {
174+
TableDescriptor td = createTableDescriptorForCurrentMethod();
175+
// Create regions.
176+
RegionInfo merged =
177+
createRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
178+
RegionInfo parenta =
179+
createRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
180+
RegionInfo parentb =
181+
createRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));
182+
183+
List<RegionInfo> parents = new ArrayList<>();
184+
parents.add(parenta);
185+
parents.add(parentb);
186+
187+
Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
188+
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
189+
Path storedir =
190+
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
191+
createMergeReferenceFile(storedir, merged, parenta);
192+
193+
MasterServices mockedMasterServices = spy(masterServices);
194+
MasterFileSystem mockedMasterFileSystem = spy(masterServices.getMasterFileSystem());
195+
FileSystem mockedFileSystem = spy(masterServices.getMasterFileSystem().getFileSystem());
196+
197+
when(mockedMasterServices.getMasterFileSystem()).thenReturn(mockedMasterFileSystem);
198+
when(mockedMasterFileSystem.getFileSystem()).thenReturn(mockedFileSystem);
199+
200+
// throw on the first exists check
201+
doThrow(new IOException("Some exception")).when(mockedFileSystem).exists(any());
202+
203+
assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
204+
205+
// throw on the second exists check (within HRegionfileSystem)
206+
AtomicBoolean returned = new AtomicBoolean(false);
207+
doAnswer(invocationOnMock -> {
208+
if (!returned.get()) {
209+
returned.set(true);
210+
return masterServices.getMasterFileSystem().getFileSystem()
211+
.exists(invocationOnMock.getArgument(0));
212+
}
213+
throw new IOException("Some exception");
214+
}).when(mockedFileSystem).exists(any());
215+
216+
assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
217+
}
218+
219+
private Path createMergeReferenceFile(Path storeDir, RegionInfo mergedRegion,
220+
RegionInfo parentRegion) throws IOException {
221+
Reference ref = Reference.createTopReference(mergedRegion.getStartKey());
222+
long now = EnvironmentEdgeManager.currentTime();
223+
// Reference name has this format: StoreFile#REF_NAME_PARSER
224+
Path p = new Path(storeDir, Long.toString(now) + "." + parentRegion.getEncodedName());
225+
FileSystem fs = this.masterServices.getMasterFileSystem().getFileSystem();
226+
return ref.write(fs, p);
227+
}
228+
129229
/**
130230
* Test clearing a split parent.
131231
*/

0 commit comments

Comments
 (0)