@@ -47,6 +47,7 @@
import org.apache.hadoop.hbase.regionserver.querymatcher.CompactionScanQueryMatcher;
import org.apache.hadoop.hbase.regionserver.querymatcher.ScanQueryMatcher;
import org.apache.hadoop.hbase.regionserver.querymatcher.UserScanQueryMatcher;
import org.apache.hadoop.hbase.security.visibility.VisibilityUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
@@ -163,6 +164,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner

protected final long readPt;
private boolean topChanged = false;
private final boolean visibilityLabelEnabled;

/** An internal constructor. */
private StoreScanner(HStore store, Scan scan, ScanInfo scanInfo, int numColumns, long readPt,
@@ -177,6 +179,7 @@ private StoreScanner(HStore store, Scan scan, ScanInfo scanInfo, int numColumns,
this.now = EnvironmentEdgeManager.currentTime();
this.oldestUnexpiredTS = scan.isRaw() ? 0L : now - scanInfo.getTtl();
this.minVersions = scanInfo.getMinVersions();
visibilityLabelEnabled = store != null && VisibilityUtils.isVisibilityLabelEnabled(store.conf);

// We look up row-column Bloom filters for multi-column queries as part of
// the seek operation. However, we also look the row-column Bloom filter
@@ -246,7 +249,7 @@ public StoreScanner(HStore store, ScanInfo scanInfo, Scan scan, NavigableSet<byt
throw new DoNotRetryIOException("Cannot specify any column for a raw scan");
}
matcher = UserScanQueryMatcher.create(scan, scanInfo, columns, oldestUnexpiredTS, now,
store.getCoprocessorHost());
store.getCoprocessorHost(), visibilityLabelEnabled);

store.addChangedReaderObserver(this);

@@ -360,8 +363,8 @@ public StoreScanner(ScanInfo scanInfo, ScanType scanType,
this(null, scan, scanInfo, columns != null ? columns.size() : 0, 0L, scan.getCacheBlocks(),
scanType);
if (scanType == ScanType.USER_SCAN) {
this.matcher =
UserScanQueryMatcher.create(scan, scanInfo, columns, oldestUnexpiredTS, now, null);
this.matcher = UserScanQueryMatcher.create(scan, scanInfo, columns, oldestUnexpiredTS, now,
null, visibilityLabelEnabled);
} else {
this.matcher = CompactionScanQueryMatcher.create(scanInfo, scanType, Long.MAX_VALUE,
PrivateConstants.OLDEST_TIMESTAMP, oldestUnexpiredTS, now, null, null, null);
@@ -375,8 +378,8 @@ public StoreScanner(ScanInfo scanInfo, ScanType scanType,
// 0 is passed as readpoint because the test bypasses Store
this(null, scan, scanInfo, columns != null ? columns.size() : 0, 0L, scan.getCacheBlocks(),
ScanType.USER_SCAN);
this.matcher =
UserScanQueryMatcher.create(scan, scanInfo, columns, oldestUnexpiredTS, now, null);
this.matcher = UserScanQueryMatcher.create(scan, scanInfo, columns, oldestUnexpiredTS, now,
null, visibilityLabelEnabled);
seekAllScanner(scanInfo, scanners);
}

@@ -636,10 +639,11 @@ public boolean next(List<? super ExtendedCell> outResult, ScannerContext scanner
scannerContext.incrementBlockProgress(blockSize);
});

prevCell = cell;
scannerContext.setLastPeekedCell(cell);
topChanged = false;
ScanQueryMatcher.MatchCode qcode = matcher.match(cell);
ScanQueryMatcher.MatchCode qcode = matcher.match(cell, prevCell);
LOG.trace("next - cell={}, prevCell={}, qCode={}", cell, prevCell, qcode);
prevCell = cell;
switch (qcode) {
case INCLUDE:
case INCLUDE_AND_SEEK_NEXT_ROW:
@@ -18,6 +18,7 @@
package org.apache.hadoop.hbase.regionserver.querymatcher;

import java.io.IOException;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.KeepDeletedCells;
import org.apache.hadoop.hbase.PrivateCellUtil;
@@ -40,12 +41,19 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher {
/** whether time range queries can see rows "behind" a delete */
protected final boolean seePastDeleteMarkers;

private final int scanMaxVersions;

private final boolean visibilityLabelEnabled;

protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, ColumnTracker columns,
boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long now) {
boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long now,
boolean visibilityLabelEnabled) {
super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now);
this.deletes = deletes;
this.get = scan.isGetScan();
this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() != KeepDeletedCells.FALSE;
this.scanMaxVersions = Math.max(scan.getMaxVersions(), scanInfo.getMaxVersions());
this.visibilityLabelEnabled = visibilityLabelEnabled;
}

@Override
@@ -56,11 +64,16 @@ public void beforeShipped() throws IOException {

@Override
public MatchCode match(ExtendedCell cell) throws IOException {
return match(cell, null);
}

@Override
public MatchCode match(ExtendedCell cell, ExtendedCell prevCell) throws IOException {
if (filter != null && filter.filterAllRemaining()) {
return MatchCode.DONE_SCAN;
}
MatchCode returnCode = preCheck(cell);
if (returnCode != null) {
MatchCode returnCode;
if ((returnCode = preCheck(cell)) != null) {
return returnCode;
}
long timestamp = cell.getTimestamp();
@@ -71,15 +84,42 @@ public MatchCode match(ExtendedCell cell) throws IOException {
if (includeDeleteMarker) {
this.deletes.add(cell);
}
return MatchCode.SKIP;
// In some cases, the optimization cannot be done
if (!canOptimizeReadDeleteMarkers()) {
return MatchCode.SKIP;
}
}
returnCode = checkDeleted(deletes, cell);
if (returnCode != null) {
// optimization when prevCell is Delete or DeleteFamilyVersion
if ((returnCode = checkDeletedEffectively(cell, prevCell)) != null) {
return returnCode;
}
if ((returnCode = checkDeleted(deletes, cell)) != null) {
return returnCode;
}
return matchColumn(cell, timestamp, typeByte);
}

// If prevCell is a delete marker (Delete or DeleteFamilyVersion) and cell is a Put or another
// delete marker with the same row, column and timestamp, the cell is effectively deleted,
// so we can do SEEK_NEXT_COL.
private MatchCode checkDeletedEffectively(ExtendedCell cell, ExtendedCell prevCell) {
if (
prevCell != null && canOptimizeReadDeleteMarkers()
&& CellUtil.matchingRowColumn(prevCell, cell) && CellUtil.matchingTimestamp(prevCell, cell)
&& (PrivateCellUtil.isDeleteType(prevCell)
|| PrivateCellUtil.isDeleteFamilyVersion(prevCell))
) {
return MatchCode.SEEK_NEXT_COL;
}
return null;
}

private boolean canOptimizeReadDeleteMarkers() {
// for simplicity, optimization works only for these cases
return !seePastDeleteMarkers && scanMaxVersions == 1 && !visibilityLabelEnabled
&& getFilter() == null && !(deletes instanceof NewVersionBehaviorTracker);
}
Comment on lines +117 to +121

Contributor:
@EungsopYoo have you also considered Dual File Compaction (#5545)?
Could you also run a perf test comparing Dual File Compaction with this optimization? That might be really helpful.

EungsopYoo (Contributor, Author), Jan 20, 2025:

@virajjasani
I have reviewed the Dual File Compaction approach you mentioned. This PR and Dual File Compaction have something in common, especially in how they handle delete markers, but I think there are some differences.

This PR focuses on accumulated delete markers on the same row or cell, whereas Dual File Compaction handles delete markers across different rows or columns. Also, this PR can optimize reads from both the MemStore and StoreFiles, while Dual File Compaction can only optimize reads from StoreFiles.

So I think they are complementary and can be used together.
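
To make the targeted pattern concrete, here is a minimal client-side sketch of the workload this PR is aimed at: many delete markers accumulating on the same row and column family, followed by a read of that row. The table, row, and family names are made up for illustration; only standard HBase client API calls are used.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class AccumulatedDeleteMarkersExample {
  public static void main(String[] args) throws Exception {
    try (Connection conn = ConnectionFactory.createConnection();
        Table table = conn.getTable(TableName.valueOf("test"))) {
      long now = System.currentTimeMillis();
      List<Delete> deletes = new ArrayList<>();
      for (int i = 0; i < 10000; i++) {
        // Each delete uses a distinct timestamp, so all markers stay in the store
        // until they are purged by a major compaction.
        deletes.add(new Delete(Bytes.toBytes("row")).addFamily(Bytes.toBytes("c"), now + i));
      }
      table.delete(deletes);
      // Without this PR the matcher walks every accumulated marker on this read;
      // with it, the matcher can answer SEEK_NEXT_COL once it sees the run of markers.
      table.get(new Get(Bytes.toBytes("row")));
    }
  }
}

The shell script later in this thread exercises essentially the same pattern from the HBase shell.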

Contributor:

Thanks @EungsopYoo, this is what I was also expecting.

On the Jira https://issues.apache.org/jira/browse/HBASE-25972, Kadir has also shown how the full-scan improvement was measured using PE (second comment on the Jira). Could you also run the same steps to see how much improvement you observe with this PR?

Contributor (Author):

@virajjasani
PE does not have a test case that does Put, Delete, and Get on the same row. Should I add a new test case and run it?

Contributor:

It's not necessary, because the steps mentioned in the Jira will take care of adding many delete markers, so you can follow the exact same steps. Thank you!

Contributor:

To me, this improvement is only meaningful when the scanned data is in the memstore, assuming that the skip list is used for jumping from one column to the next (I have not looked at the code in detail recently, so I assume that is the case). However, when HBase scans data from an HFile, do we have data structures in place to jump from one column to the next? I think we do not. Not only do we linearly scan the cells within a row, we also linearly scan all rows within an HBase block, don't we? So I do not understand why skipping to the next column would be a significant optimization in general.

Contributor (Author):

> Thanks @EungsopYoo, how about you keep KEEP_DELETED_CELLS false above and see how the perf numbers look with and without your patch?

I have run the same tests again, except with KEEP_DELETED_CELLS set to false.

master

  1. scan with dual file compaction enabled 4047ms
  2. scan with dual file compaction disabled 4277ms
  3. scan with delete markers and dual file compaction disabled 2.5031 seconds
  4. scan with delete markers and dual file compaction enabled 0.0198 seconds

this PR

  1. scan with dual file compaction enabled 4134ms
  2. scan with dual file compaction disabled 3383ms
  3. scan with delete markers and dual file compaction disabled 3.4726 seconds
  4. scan with delete markers and dual file compaction enabled 0.0245 seconds

It looks like there is some performance degradation in result 3. I will dig into it.

EungsopYoo (Contributor, Author), Jan 23, 2025:

> (quoting the comment above questioning whether skipping to the next column helps when reading from HFiles rather than from the MemStore)

https://issues.apache.org/jira/browse/HBASE-29039
The performance tests in the Jira description cover only the case of reading from the MemStore. So I have run new performance tests that read from StoreFiles only, with and without dual file compaction.

create 'test', 'c'

java_import org.apache.hadoop.hbase.client.Delete
java_import org.apache.hadoop.hbase.TableName
java_import java.lang.System

con = @hbase.instance_variable_get(:@connection)
table = con.getTable(TableName.valueOf('test'))

1000.times do |i|
  # batch 10000 deletes with different timestamps every 10 seconds
  now = System.currentTimeMillis()
  dels = 10000.times.map do |j|
    del = Delete.new(Bytes.toBytes('row'))
    del.addFamily(Bytes.toBytes('c'), now + j)
  end
  table.delete(dels)
  sleep(10)
  flush 'test'
  # Manually trigger a minor compaction, because compaction is not triggered automatically here (I don't know why yet)
  compact 'test' if i % 10 == 0
  puts "i - #{i}"
  # Read after flush from StoreFiles
  get 'test', 'row'
end

Get latency for 'row' after each flush (the "Took" time reported by the shell; every 10th iteration shown; every get returned 0 row(s)):

  i     master, no DFC   master, DFC   this PR, no DFC   this PR, DFC
  0     0.0184 s         0.0050 s      0.0061 s          0.0046 s
  10    0.0206 s         0.0175 s      0.0102 s          0.0103 s
  20    0.0398 s         0.0265 s      0.0052 s          0.0074 s
  30    0.0462 s         0.0310 s      0.0073 s          0.0067 s
  40    0.0517 s         0.0205 s      0.0077 s          0.0077 s
  50    0.0730 s         0.0532 s      0.0115 s          0.0091 s
  60    0.0903 s         0.0360 s      0.0101 s          0.0106 s

(DFC = dual file compaction.)

The results show that the optimization of this PR also works when reading from StoreFiles, even without dual file compaction. What do you think about these results?

EungsopYoo (Contributor, Author), Jan 23, 2025:

> (quoting the earlier comment: the KEEP_DELETED_CELLS=false results above, where result 3 regressed from 2.5031 s on master to 3.4726 s with this PR)

https://github.com/EungsopYoo/hbase/blob/63901155caf5c226b02564128669234c08251e8d/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java#L81-L99

The slight performance degradation is due to removing the early return of MatchCode.SKIP in the normal cases. Because of that removal, the checkDeleted() method is executed more often than before, which adds some computational overhead. I found this by removing the added code blocks one by one.
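
To spell out what moved, here is a toy paraphrase of the control flow (my own simplified names, not the actual HBase classes): the unconditional SKIP for delete markers became conditional, so in the optimized configuration markers now also reach the later checks.

// Toy paraphrase of the match() control flow, not the real HBase code.
public class EarlyReturnParaphrase {
  enum MatchCode { SKIP, SEEK_NEXT_COL, INCLUDE }

  // Old shape: a delete marker always short-circuited with SKIP.
  static MatchCode matchOld(boolean isDeleteMarker) {
    if (isDeleteMarker) {
      return MatchCode.SKIP; // early return
    }
    return checkDeleted();
  }

  // New shape: the early return only happens when the optimization does not apply,
  // so delete markers can fall through to checkDeletedEffectively()/checkDeleted().
  static MatchCode matchNew(boolean isDeleteMarker, boolean canOptimize, boolean deletedEffectively) {
    if (isDeleteMarker && !canOptimize) {
      return MatchCode.SKIP;
    }
    if (deletedEffectively) {
      return MatchCode.SEEK_NEXT_COL; // jump past the rest of the column
    }
    return checkDeleted();
  }

  // Stand-in for the per-cell DeleteTracker bookkeeping that now runs more often.
  static MatchCode checkDeleted() {
    return MatchCode.INCLUDE;
  }

  public static void main(String[] args) {
    System.out.println(matchOld(true));              // SKIP
    System.out.println(matchNew(true, true, false)); // falls through to checkDeleted()
  }
}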

Contributor (Author):

ScanQueryMatcher.MatchCode qcode = matcher.match(cell, prevCell);

        case SEEK_NEXT_COL:
          seekOrSkipToNextColumn(cell);
          NextState stateAfterSeekNextColumn = needToReturn(outResult);
          if (stateAfterSeekNextColumn != null) {
            return scannerContext.setScannerState(stateAfterSeekNextColumn).hasMoreValues();
          }
          break;
        case SKIP:
          this.heap.next();
          break;

With some more digging, I found that the actual cause of the degradation is the return value of matcher.match(). Processing a SKIP return value is very lightweight, but processing a SEEK_NEXT_COL return value is much heavier.


@Override
protected void reset() {
deletes.reset();
@@ -92,11 +132,11 @@ protected boolean isGet() {

public static NormalUserScanQueryMatcher create(Scan scan, ScanInfo scanInfo,
ColumnTracker columns, DeleteTracker deletes, boolean hasNullColumn, long oldestUnexpiredTS,
long now) throws IOException {
long now, boolean visibilityLabelEnabled) throws IOException {
if (scan.isReversed()) {
if (scan.includeStopRow()) {
return new NormalUserScanQueryMatcher(scan, scanInfo, columns, hasNullColumn, deletes,
oldestUnexpiredTS, now) {
oldestUnexpiredTS, now, visibilityLabelEnabled) {

@Override
protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
@@ -105,7 +145,7 @@ protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
};
} else {
return new NormalUserScanQueryMatcher(scan, scanInfo, columns, hasNullColumn, deletes,
oldestUnexpiredTS, now) {
oldestUnexpiredTS, now, visibilityLabelEnabled) {

@Override
protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
@@ -116,7 +156,7 @@ protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
} else {
if (scan.includeStopRow()) {
return new NormalUserScanQueryMatcher(scan, scanInfo, columns, hasNullColumn, deletes,
oldestUnexpiredTS, now) {
oldestUnexpiredTS, now, visibilityLabelEnabled) {

@Override
protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
@@ -125,7 +165,7 @@ protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
};
} else {
return new NormalUserScanQueryMatcher(scan, scanInfo, columns, hasNullColumn, deletes,
oldestUnexpiredTS, now) {
oldestUnexpiredTS, now, visibilityLabelEnabled) {

@Override
protected boolean moreRowsMayExistsAfter(int cmpToStopRow) {
@@ -238,6 +238,25 @@ protected final MatchCode checkDeleted(DeleteTracker deletes, ExtendedCell cell)
*/
public abstract MatchCode match(ExtendedCell cell) throws IOException;

/**
* Determines if the caller should do one of several things:
* <ul>
* <li>seek/skip to the next row (MatchCode.SEEK_NEXT_ROW)</li>
* <li>seek/skip to the next column (MatchCode.SEEK_NEXT_COL)</li>
* <li>include the current KeyValue (MatchCode.INCLUDE)</li>
* <li>ignore the current KeyValue (MatchCode.SKIP)</li>
* <li>go to the next row (MatchCode.DONE)</li>
* </ul>
* @param cell KeyValue to check
* @param prevCell KeyValue checked previously
* @return The match code instance.
* @throws IOException in case there is an internal consistency problem caused by a data
* corruption.
*/
public MatchCode match(ExtendedCell cell, ExtendedCell prevCell) throws IOException {
return match(cell);
}

/** Returns the start key */
public ExtendedCell getStartKey() {
return startKey;
@@ -284,7 +303,8 @@ public ExtendedCell getKeyForNextColumn(ExtendedCell cell) {
// see HBASE-18471 for more details
// see TestFromClientSide3#testScanAfterDeletingSpecifiedRow
// see TestFromClientSide3#testScanAfterDeletingSpecifiedRowV2
if (cell.getQualifierLength() == 0) {
// But we can seek to the next column if the cell is a DeleteFamily marker.
if (cell.getQualifierLength() == 0 && !PrivateCellUtil.isDeleteFamily(cell)) {
ExtendedCell nextKey = PrivateCellUtil.createNextOnRowCol(cell);
if (nextKey != cell) {
return nextKey;
@@ -287,7 +287,8 @@ public boolean moreRowsMayExistAfter(ExtendedCell cell) {

public static UserScanQueryMatcher create(Scan scan, ScanInfo scanInfo,
NavigableSet<byte[]> columns, long oldestUnexpiredTS, long now,
RegionCoprocessorHost regionCoprocessorHost) throws IOException {
RegionCoprocessorHost regionCoprocessorHost, boolean visibilityLabelEnabled)
throws IOException {
boolean hasNullColumn =
!(columns != null && columns.size() != 0 && columns.first().length != 0);
Pair<DeleteTracker, ColumnTracker> trackers =
@@ -299,7 +300,7 @@ public static UserScanQueryMatcher create(Scan scan, ScanInfo scanInfo,
oldestUnexpiredTS, now);
} else {
return NormalUserScanQueryMatcher.create(scan, scanInfo, columnTracker, deleteTracker,
hasNullColumn, oldestUnexpiredTS, now);
hasNullColumn, oldestUnexpiredTS, now, visibilityLabelEnabled);
}
}
}
@@ -39,6 +39,7 @@
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.TagType;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.io.util.StreamUtils;
@@ -329,6 +330,15 @@ public static List<Tag> createVisibilityExpTags(String visExpression,
return tags;
}

public static boolean isVisibilityLabelEnabled(Configuration conf) {
return conf.getInt("hfile.format.version", 0) == 3
&& conf.getBoolean(User.HBASE_SECURITY_AUTHORIZATION_CONF_KEY, false)
&& (conf.get(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, "")
.contains(VisibilityController.class.getName())
|| conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, "")
.contains(VisibilityController.class.getName()));
}

private static void getLabelOrdinals(ExpressionNode node, List<Integer> labelOrdinals,
Set<Integer> auths, boolean checkAuths, VisibilityLabelOrdinalProvider ordinalProvider)
throws IOException, InvalidLabelException {