Skip to content

Commit ac7b475

Browse files
Apache9 authored and apurtell committed
HBASE-26700 The way we bypass broken track file is not enough in StoreFileListFile (apache#4055)
Signed-off-by: Wellington Ramos Chevreuil <[email protected]>
1 parent d3df795 commit ac7b475

File tree

5 files changed

+216
-25
lines changed

5 files changed

+216
-25
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFile.java

Lines changed: 41 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,10 @@
1717
*/
1818
package org.apache.hadoop.hbase.regionserver.storefiletracker;
1919

20+
import java.io.EOFException;
2021
import java.io.FileNotFoundException;
2122
import java.io.IOException;
23+
import java.util.zip.CRC32;
2224
import org.apache.hadoop.fs.FSDataInputStream;
2325
import org.apache.hadoop.fs.FSDataOutputStream;
2426
import org.apache.hadoop.fs.FileSystem;
@@ -29,9 +31,6 @@
2931
import org.slf4j.Logger;
3032
import org.slf4j.LoggerFactory;
3133

32-
import org.apache.hbase.thirdparty.com.google.common.io.ByteStreams;
33-
import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
34-
3534
import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileList;
3635

3736
/**
@@ -42,18 +41,27 @@
4241
* other file.
4342
* <p/>
4443
* So in this way, we could avoid listing when we want to load the store file list file.
44+
* <p/>
45+
* To prevent loading partial file, we use the first 4 bytes as file length, and also append a 4
46+
* bytes crc32 checksum at the end. This is because the protobuf message parser sometimes can return
47+
* without error on partial bytes if you stop at some special points, but the return message will
48+
* have incorrect field value. We should try our best to prevent this happens because loading an
49+
* incorrect store file list file usually leads to data loss.
4550
*/
4651
@InterfaceAudience.Private
4752
class StoreFileListFile {
4853

4954
private static final Logger LOG = LoggerFactory.getLogger(StoreFileListFile.class);
5055

51-
private static final String TRACK_FILE_DIR = ".filelist";
56+
static final String TRACK_FILE_DIR = ".filelist";
5257

5358
private static final String TRACK_FILE = "f1";
5459

5560
private static final String TRACK_FILE_ROTATE = "f2";
5661

62+
// 16 MB, which is big enough for a tracker file
63+
private static final int MAX_FILE_SIZE = 16 * 1024 * 1024;
64+
5765
private final StoreContext ctx;
5866

5967
private final Path trackFileDir;
@@ -74,16 +82,26 @@ class StoreFileListFile {
7482

7583
private StoreFileList load(Path path) throws IOException {
7684
FileSystem fs = ctx.getRegionFileSystem().getFileSystem();
77-
byte[] bytes;
85+
byte[] data;
86+
int expectedChecksum;
7887
try (FSDataInputStream in = fs.open(path)) {
79-
bytes = ByteStreams.toByteArray(in);
88+
int length = in.readInt();
89+
if (length <= 0 || length > MAX_FILE_SIZE) {
90+
throw new IOException("Invalid file length " + length +
91+
", either less than 0 or greater then max allowed size " + MAX_FILE_SIZE);
92+
}
93+
data = new byte[length];
94+
in.readFully(data);
95+
expectedChecksum = in.readInt();
8096
}
81-
// Read all the bytes and then parse it, so we will only throw InvalidProtocolBufferException
82-
// here. This is very important for upper layer to determine whether this is the normal case,
83-
// where the file does not exist or is incomplete. If there is another type of exception, the
84-
// upper layer should throw it out instead of just ignoring it, otherwise it will lead to data
85-
// loss.
86-
return StoreFileList.parseFrom(bytes);
97+
CRC32 crc32 = new CRC32();
98+
crc32.update(data);
99+
int calculatedChecksum = (int) crc32.getValue();
100+
if (expectedChecksum != calculatedChecksum) {
101+
throw new IOException(
102+
"Checksum mismatch, expected " + expectedChecksum + ", actual " + calculatedChecksum);
103+
}
104+
return StoreFileList.parseFrom(data);
87105
}
88106

89107
private int select(StoreFileList[] lists) {
@@ -101,9 +119,9 @@ StoreFileList load() throws IOException {
101119
for (int i = 0; i < 2; i++) {
102120
try {
103121
lists[i] = load(trackFiles[i]);
104-
} catch (FileNotFoundException | InvalidProtocolBufferException e) {
122+
} catch (FileNotFoundException | EOFException e) {
105123
// this is normal case, so use info and do not log stacktrace
106-
LOG.info("Failed to load track file {}: {}", trackFiles[i], e);
124+
LOG.info("Failed to load track file {}: {}", trackFiles[i], e.toString());
107125
}
108126
}
109127
int winnerIndex = select(lists);
@@ -124,10 +142,17 @@ void update(StoreFileList.Builder builder) throws IOException {
124142
// we need to call load first to load the prevTimestamp and also the next file
125143
load();
126144
}
127-
FileSystem fs = ctx.getRegionFileSystem().getFileSystem();
128145
long timestamp = Math.max(prevTimestamp + 1, EnvironmentEdgeManager.currentTime());
146+
byte[] actualData = builder.setTimestamp(timestamp).build().toByteArray();
147+
CRC32 crc32 = new CRC32();
148+
crc32.update(actualData);
149+
int checksum = (int) crc32.getValue();
150+
// 4 bytes length at the beginning, plus 4 bytes checksum
151+
FileSystem fs = ctx.getRegionFileSystem().getFileSystem();
129152
try (FSDataOutputStream out = fs.create(trackFiles[nextTrackFile], true)) {
130-
builder.setTimestamp(timestamp).build().writeTo(out);
153+
out.writeInt(actualData.length);
154+
out.write(actualData);
155+
out.writeInt(checksum);
131156
}
132157
// record timestamp
133158
prevTimestamp = timestamp;

hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@
4040
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
4141
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
4242
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
43-
import org.apache.hadoop.hbase.regionserver.storefiletracker.TestStoreFileTracker;
43+
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerForTest;
4444
import org.apache.hadoop.hbase.testclassification.MasterTests;
4545
import org.apache.hadoop.hbase.testclassification.MediumTests;
4646
import org.apache.hadoop.hbase.util.Bytes;
@@ -96,7 +96,7 @@ public void testCreateWithTrackImpl() throws Exception {
9696
final TableName tableName = TableName.valueOf(name.getMethodName());
9797
ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
9898
TableDescriptor htd = MasterProcedureTestingUtility.createHTD(tableName, F1);
99-
String trackerName = TestStoreFileTracker.class.getName();
99+
String trackerName = StoreFileTrackerForTest.class.getName();
100100
htd = TableDescriptorBuilder.newBuilder(htd).setValue(TRACKER_IMPL, trackerName).build();
101101
RegionInfo[] regions = ModifyRegionUtils.createRegionInfos(htd, null);
102102
long procId = ProcedureTestingUtility.submitAndWait(procExec,

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestMergesSplitsAddToTracker.java

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
import org.apache.hadoop.hbase.client.TableDescriptor;
4444
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
4545
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
46-
import org.apache.hadoop.hbase.regionserver.storefiletracker.TestStoreFileTracker;
46+
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerForTest;
4747
import org.apache.hadoop.hbase.testclassification.LargeTests;
4848
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
4949
import org.apache.hadoop.hbase.util.Bytes;
@@ -86,13 +86,13 @@ public static void afterClass() throws Exception {
8686

8787
@Before
8888
public void setup(){
89-
TestStoreFileTracker.clear();
89+
StoreFileTrackerForTest.clear();
9090
}
9191

9292
private TableName createTable(byte[] splitKey) throws IOException {
9393
TableDescriptor td = TableDescriptorBuilder.newBuilder(name.getTableName())
9494
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY_NAME))
95-
.setValue(TRACKER_IMPL, TestStoreFileTracker.class.getName()).build();
95+
.setValue(TRACKER_IMPL, StoreFileTrackerForTest.class.getName()).build();
9696
if (splitKey != null) {
9797
TEST_UTIL.getAdmin().createTable(td, new byte[][] { splitKey });
9898
} else {
@@ -247,7 +247,8 @@ private void validateDaughterRegionsFiles(HRegion region, String orignalFileName
247247

248248
private void verifyFilesAreTracked(Path regionDir, FileSystem fs) throws Exception {
249249
for (FileStatus f : fs.listStatus(new Path(regionDir, FAMILY_NAME_STR))) {
250-
assertTrue(TestStoreFileTracker.tracked(regionDir.getName(), FAMILY_NAME_STR, f.getPath()));
250+
assertTrue(
251+
StoreFileTrackerForTest.tracked(regionDir.getName(), FAMILY_NAME_STR, f.getPath()));
251252
}
252253
}
253254

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,14 +32,14 @@
3232
import org.slf4j.Logger;
3333
import org.slf4j.LoggerFactory;
3434

35-
public class TestStoreFileTracker extends DefaultStoreFileTracker {
35+
public class StoreFileTrackerForTest extends DefaultStoreFileTracker {
3636

37-
private static final Logger LOG = LoggerFactory.getLogger(TestStoreFileTracker.class);
37+
private static final Logger LOG = LoggerFactory.getLogger(StoreFileTrackerForTest.class);
3838
private static ConcurrentMap<String, BlockingQueue<StoreFileInfo>> trackedFiles =
3939
new ConcurrentHashMap<>();
4040
private String storeId;
4141

42-
public TestStoreFileTracker(Configuration conf, boolean isPrimaryReplica, StoreContext ctx) {
42+
public StoreFileTrackerForTest(Configuration conf, boolean isPrimaryReplica, StoreContext ctx) {
4343
super(conf, isPrimaryReplica, ctx);
4444
if (ctx != null && ctx.getRegionFileSystem() != null) {
4545
this.storeId = ctx.getRegionInfo().getEncodedName() + "-" + ctx.getFamily().getNameAsString();
Lines changed: 165 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,165 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.regionserver.storefiletracker;
19+
20+
import static org.junit.Assert.assertNull;
21+
import static org.junit.Assert.assertThrows;
22+
import static org.mockito.Mockito.mock;
23+
import static org.mockito.Mockito.when;
24+
25+
import java.io.IOException;
26+
import org.apache.hadoop.fs.FSDataInputStream;
27+
import org.apache.hadoop.fs.FSDataOutputStream;
28+
import org.apache.hadoop.fs.FileStatus;
29+
import org.apache.hadoop.fs.FileSystem;
30+
import org.apache.hadoop.fs.Path;
31+
import org.apache.hadoop.hbase.HBaseClassTestRule;
32+
import org.apache.hadoop.hbase.HBaseCommonTestingUtility;
33+
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
34+
import org.apache.hadoop.hbase.regionserver.StoreContext;
35+
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
36+
import org.apache.hadoop.hbase.testclassification.SmallTests;
37+
import org.apache.hadoop.hbase.util.Bytes;
38+
import org.junit.AfterClass;
39+
import org.junit.Before;
40+
import org.junit.ClassRule;
41+
import org.junit.Rule;
42+
import org.junit.Test;
43+
import org.junit.experimental.categories.Category;
44+
import org.junit.rules.TestName;
45+
import org.slf4j.Logger;
46+
import org.slf4j.LoggerFactory;
47+
48+
import org.apache.hbase.thirdparty.com.google.common.io.ByteStreams;
49+
50+
import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileList;
51+
52+
@Category({ RegionServerTests.class, SmallTests.class })
53+
public class TestStoreFileListFile {
54+
55+
@ClassRule
56+
public static final HBaseClassTestRule CLASS_RULE =
57+
HBaseClassTestRule.forClass(TestStoreFileListFile.class);
58+
59+
private static final Logger LOG = LoggerFactory.getLogger(TestStoreFileListFile.class);
60+
61+
private static final HBaseCommonTestingUtility UTIL = new HBaseCommonTestingUtility();
62+
63+
private Path testDir;
64+
65+
private StoreFileListFile storeFileListFile;
66+
67+
@Rule
68+
public TestName name = new TestName();
69+
70+
@Before
71+
public void setUp() throws IOException {
72+
testDir = UTIL.getDataTestDir(name.getMethodName());
73+
HRegionFileSystem hfs = mock(HRegionFileSystem.class);
74+
when(hfs.getFileSystem()).thenReturn(FileSystem.get(UTIL.getConfiguration()));
75+
StoreContext ctx = StoreContext.getBuilder().withFamilyStoreDirectoryPath(testDir)
76+
.withRegionFileSystem(hfs).build();
77+
storeFileListFile = new StoreFileListFile(ctx);
78+
}
79+
80+
@AfterClass
81+
public static void tearDown() {
82+
UTIL.cleanupTestDir();
83+
}
84+
85+
@Test
86+
public void testEmptyLoad() throws IOException {
87+
assertNull(storeFileListFile.load());
88+
}
89+
90+
private FileStatus getOnlyTrackerFile(FileSystem fs) throws IOException {
91+
return fs.listStatus(new Path(testDir, StoreFileListFile.TRACK_FILE_DIR))[0];
92+
}
93+
94+
private byte[] readAll(FileSystem fs, Path file) throws IOException {
95+
try (FSDataInputStream in = fs.open(file)) {
96+
return ByteStreams.toByteArray(in);
97+
}
98+
}
99+
100+
private void write(FileSystem fs, Path file, byte[] buf, int off, int len) throws IOException {
101+
try (FSDataOutputStream out = fs.create(file, true)) {
102+
out.write(buf, off, len);
103+
}
104+
}
105+
106+
@Test
107+
public void testLoadPartial() throws IOException {
108+
StoreFileList.Builder builder = StoreFileList.newBuilder();
109+
storeFileListFile.update(builder);
110+
FileSystem fs = FileSystem.get(UTIL.getConfiguration());
111+
FileStatus trackerFileStatus = getOnlyTrackerFile(fs);
112+
// truncate it so we do not have enough data
113+
LOG.info("Truncate file {} with size {} to {}", trackerFileStatus.getPath(),
114+
trackerFileStatus.getLen(), trackerFileStatus.getLen() / 2);
115+
byte[] content = readAll(fs, trackerFileStatus.getPath());
116+
write(fs, trackerFileStatus.getPath(), content, 0, content.length / 2);
117+
assertNull(storeFileListFile.load());
118+
}
119+
120+
private void writeInt(byte[] buf, int off, int value) {
121+
byte[] b = Bytes.toBytes(value);
122+
for (int i = 0; i < 4; i++) {
123+
buf[off + i] = b[i];
124+
}
125+
}
126+
127+
@Test
128+
public void testZeroFileLength() throws IOException {
129+
StoreFileList.Builder builder = StoreFileList.newBuilder();
130+
storeFileListFile.update(builder);
131+
FileSystem fs = FileSystem.get(UTIL.getConfiguration());
132+
FileStatus trackerFileStatus = getOnlyTrackerFile(fs);
133+
// write a zero length
134+
byte[] content = readAll(fs, trackerFileStatus.getPath());
135+
writeInt(content, 0, 0);
136+
write(fs, trackerFileStatus.getPath(), content, 0, content.length);
137+
assertThrows(IOException.class, () -> storeFileListFile.load());
138+
}
139+
140+
@Test
141+
public void testBigFileLength() throws IOException {
142+
StoreFileList.Builder builder = StoreFileList.newBuilder();
143+
storeFileListFile.update(builder);
144+
FileSystem fs = FileSystem.get(UTIL.getConfiguration());
145+
FileStatus trackerFileStatus = getOnlyTrackerFile(fs);
146+
// write a large length
147+
byte[] content = readAll(fs, trackerFileStatus.getPath());
148+
writeInt(content, 0, 128 * 1024 * 1024);
149+
write(fs, trackerFileStatus.getPath(), content, 0, content.length);
150+
assertThrows(IOException.class, () -> storeFileListFile.load());
151+
}
152+
153+
@Test
154+
public void testChecksumMismatch() throws IOException {
155+
StoreFileList.Builder builder = StoreFileList.newBuilder();
156+
storeFileListFile.update(builder);
157+
FileSystem fs = FileSystem.get(UTIL.getConfiguration());
158+
FileStatus trackerFileStatus = getOnlyTrackerFile(fs);
159+
// flip one byte
160+
byte[] content = readAll(fs, trackerFileStatus.getPath());
161+
content[5] = (byte) ~content[5];
162+
write(fs, trackerFileStatus.getPath(), content, 0, content.length);
163+
assertThrows(IOException.class, () -> storeFileListFile.load());
164+
}
165+
}

0 commit comments

Comments (0)