From 915ecef4c4bf03d370ec6cdaf0edf929731d3c41 Mon Sep 17 00:00:00 2001 From: Yutong Sean Date: Mon, 13 Dec 2021 17:24:37 +0800 Subject: [PATCH 1/6] HBASE-26566 Optimize determine E step in OrderedBytes --- .../org/apache/hadoop/hbase/util/OrderedBytes.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java index f565bc1ad181..d65385d543c3 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java @@ -641,8 +641,11 @@ private static int encodeNumericSmall(PositionedByteRange dst, BigDecimal val) { } // normalize abs(val) to determine E - while (abs.compareTo(EN10) < 0) { abs = abs.movePointRight(8); e += 4; } - while (abs.compareTo(EN2) < 0) { abs = abs.movePointRight(2); e++; } + int zerosBeforeFirstNonZero = abs.scale() - abs.precision(); + int lengthToMoveRight = zerosBeforeFirstNonZero % 2 == + 0 ? zerosBeforeFirstNonZero : zerosBeforeFirstNonZero - 1; + e = lengthToMoveRight / 2; + abs = abs.movePointRight(lengthToMoveRight); putVaruint64(dst, e, !isNeg); // encode appropriate E value. @@ -716,9 +719,10 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { } // normalize abs(val) to determine E - while (abs.compareTo(E32) >= 0 && e <= 350) { abs = abs.movePointLeft(32); e +=16; } - while (abs.compareTo(E8) >= 0 && e <= 350) { abs = abs.movePointLeft(8); e+= 4; } - while (abs.compareTo(BigDecimal.ONE) >= 0 && e <= 350) { abs = abs.movePointLeft(2); e++; } + int integerDigits = abs.precision() - abs.scale(); + int lengthToMoveLeft = integerDigits % 2 == 0 ? integerDigits : integerDigits + 1; + e = lengthToMoveLeft / 2; + abs = abs.movePointLeft(lengthToMoveLeft); // encode appropriate header byte and/or E value. 
if (e > 10) { /* large number, write out {~,}E */ From 620ef188df2738270e4c8414e5adac557eca643c Mon Sep 17 00:00:00 2001 From: Yutong Sean Date: Mon, 13 Dec 2021 20:28:42 +0800 Subject: [PATCH 2/6] Add the edge condition in encodeNumericLarge --- .../main/java/org/apache/hadoop/hbase/util/OrderedBytes.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java index d65385d543c3..266de78b80a3 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java @@ -722,6 +722,10 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { int integerDigits = abs.precision() - abs.scale(); int lengthToMoveLeft = integerDigits % 2 == 0 ? integerDigits : integerDigits + 1; e = lengthToMoveLeft / 2; + if (e > 350) { + e = 351; + lengthToMoveLeft = 702; + } abs = abs.movePointLeft(lengthToMoveLeft); // encode appropriate header byte and/or E value. 
From c97de4dbb9673f01fa3361637d3ffb4fecb1b3ed Mon Sep 17 00:00:00 2001 From: Yutong Sean Date: Thu, 16 Dec 2021 19:41:45 +0800 Subject: [PATCH 3/6] Optimized the encoding phase --- .../hadoop/hbase/util/OrderedBytes.java | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java index 266de78b80a3..f0fe20892d6f 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java @@ -314,6 +314,10 @@ public class OrderedBytes { private static final BigDecimal EN2 = BigDecimal.valueOf(1e-2); private static final BigDecimal EN10 = BigDecimal.valueOf(1e-10); + // TODO: 36 is an arbitrary encoding limit. Reevaluate once we have a better handling of + // numeric scale. + private static final int MAX_NUM_ENCODE_BYTES = 18; + /** * Max precision guaranteed to fit into a {@code long}. */ @@ -651,14 +655,7 @@ private static int encodeNumericSmall(PositionedByteRange dst, BigDecimal val) { // encode M by peeling off centimal digits, encoding x as 2x+1 startM = dst.getPosition(); - // TODO: 18 is an arbitrary encoding limit. Reevaluate once we have a better handling of - // numeric scale. - for (int i = 0; i < 18 && abs.compareTo(BigDecimal.ZERO) != 0; i++) { - abs = abs.movePointRight(2); - d = abs.intValue(); - dst.put((byte) ((2 * d + 1) & 0xff)); - abs = abs.subtract(BigDecimal.valueOf(d)); - } + encodeToCentimal(dst, abs); // terminal digit should be 2x a[offset + dst.getPosition() - 1] = (byte) (a[offset + dst.getPosition() - 1] & 0xfe); if (isNeg) { @@ -741,14 +738,7 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { // encode M by peeling off centimal digits, encoding x as 2x+1 startM = dst.getPosition(); - // TODO: 18 is an arbitrary encoding limit. 
Reevaluate once we have a better handling of - // numeric scale. - for (int i = 0; i < 18 && abs.compareTo(BigDecimal.ZERO) != 0; i++) { - abs = abs.movePointRight(2); - d = abs.intValue(); - dst.put((byte) (2 * d + 1)); - abs = abs.subtract(BigDecimal.valueOf(d)); - } + encodeToCentimal(dst, abs); // terminal digit should be 2x a[offset + dst.getPosition() - 1] = (byte) (a[offset + dst.getPosition() - 1] & 0xfe); if (isNeg) { @@ -758,6 +748,21 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { return dst.getPosition() - start; } + private static void encodeToCentimal(PositionedByteRange dst, BigDecimal val) { + String stringOfAbs = val.stripTrailingZeros().toPlainString(); + String value = stringOfAbs.substring(stringOfAbs.indexOf('.') + 1); + int d; + + int maxPrecision = Math.min(MAX_NUM_ENCODE_BYTES * 2, value.length()); + for (int i = 0; i < maxPrecision; i += 2) { + d = (value.charAt(i) - '0') * 10; + if (i + 1 < maxPrecision) { + d += (value.charAt(i + 1) - '0'); + } + dst.put((byte) (2 * d + 1)); + } + } + /** * Encode a numerical value using the variable-length encoding. * @param dst The destination to which encoded digits are written. 
From 6742739c0efcdd5db105f3a10210ab0f8219803e Mon Sep 17 00:00:00 2001 From: Yutong Sean Date: Fri, 17 Dec 2021 11:31:36 +0800 Subject: [PATCH 4/6] Remove redundant local variable --- .../main/java/org/apache/hadoop/hbase/util/OrderedBytes.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java index f0fe20892d6f..e3d596e8a053 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java @@ -636,7 +636,7 @@ private static int encodeNumericSmall(PositionedByteRange dst, BigDecimal val) { byte[] a = dst.getBytes(); boolean isNeg = val.signum() == -1; final int offset = dst.getOffset(), start = dst.getPosition(); - int e = 0, d, startM; + int e = 0, startM; if (isNeg) { /* Small negative number: 0x14, -E, ~M */ dst.put(NEG_SMALL); @@ -707,7 +707,7 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { byte[] a = dst.getBytes(); boolean isNeg = val.signum() == -1; final int start = dst.getPosition(), offset = dst.getOffset(); - int e = 0, d, startM; + int e = 0, startM; if (isNeg) { /* Large negative number: 0x08, ~E, ~M */ dst.put(NEG_LARGE); From cd75c8436efdbd992c4729c9e1b446d8685577cd Mon Sep 17 00:00:00 2001 From: Yutong Sean Date: Sat, 18 Dec 2021 17:45:47 +0800 Subject: [PATCH 5/6] Remove the redundant limit and added big numbers in UT --- .../hadoop/hbase/util/OrderedBytes.java | 25 +++++++++---------- .../hadoop/hbase/util/TestOrderedBytes.java | 15 ++++++++--- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java index e3d596e8a053..635df3db4dd1 100644 --- 
a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java @@ -309,14 +309,6 @@ public class OrderedBytes { public static final Charset UTF8 = Charset.forName("UTF-8"); private static final byte TERM = 0x00; - private static final BigDecimal E8 = BigDecimal.valueOf(1e8); - private static final BigDecimal E32 = BigDecimal.valueOf(1e32); - private static final BigDecimal EN2 = BigDecimal.valueOf(1e-2); - private static final BigDecimal EN10 = BigDecimal.valueOf(1e-10); - - // TODO: 36 is an arbitrary encoding limit. Reevaluate once we have a better handling of - // numeric scale. - private static final int MAX_NUM_ENCODE_BYTES = 18; /** * Max precision guaranteed to fit into a {@code long}. @@ -719,10 +711,6 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { int integerDigits = abs.precision() - abs.scale(); int lengthToMoveLeft = integerDigits % 2 == 0 ? integerDigits : integerDigits + 1; e = lengthToMoveLeft / 2; - if (e > 350) { - e = 351; - lengthToMoveLeft = 702; - } abs = abs.movePointLeft(lengthToMoveLeft); // encode appropriate header byte and/or E value. @@ -748,12 +736,23 @@ private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) { return dst.getPosition() - start; } + /** + * Encode a value val in [0.01, 1.0) into Centimals. + * Util function for {@link #encodeNumericLarge} and {@link #encodeNumericSmall}. + * @param dst The destination to which encoded digits are written. + * @param val A BigDecimal after the normalization. The value must be in [0.01, 1.0). 
+ */ private static void encodeToCentimal(PositionedByteRange dst, BigDecimal val) { + // The input value val must be in [0.01, 1.0) String stringOfAbs = val.stripTrailingZeros().toPlainString(); String value = stringOfAbs.substring(stringOfAbs.indexOf('.') + 1); int d; - int maxPrecision = Math.min(MAX_NUM_ENCODE_BYTES * 2, value.length()); + // If the first fractional digit is 0, we encode one digit more than MAX_PRECISION. + // We encode at most MAX_PRECISION significant digits into centimals, + // because the input value has already been normalized. + int maxPrecision = value.charAt(0) == '0' ? MAX_PRECISION + 1 : MAX_PRECISION; + maxPrecision = Math.min(maxPrecision, value.length()); for (int i = 0; i < maxPrecision; i += 2) { d = (value.charAt(i) - '0') * 10; if (i + 1 < maxPrecision) { diff --git a/hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestOrderedBytes.java b/hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestOrderedBytes.java index c8e0381969b2..943038f539a7 100644 --- a/hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestOrderedBytes.java +++ b/hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestOrderedBytes.java @@ -70,9 +70,14 @@ public class TestOrderedBytes { static final BigDecimal[] BD_VALS = { null, BigDecimal.valueOf(Long.MAX_VALUE), BigDecimal.valueOf(Long.MIN_VALUE), BigDecimal.valueOf(Double.MAX_VALUE), BigDecimal.valueOf(Double.MIN_VALUE), - BigDecimal.valueOf(Long.MAX_VALUE).multiply(BigDecimal.valueOf(100)) }; + BigDecimal.valueOf(Long.MAX_VALUE).multiply(BigDecimal.valueOf(100)), + BigDecimal.valueOf(Long.MAX_VALUE).pow(64), + BigDecimal.valueOf(Long.MAX_VALUE).pow(64).negate(), + new BigDecimal("0." + String.join("", Collections.nCopies(500, "123"))), + new BigDecimal("-0." 
+ String.join("", Collections.nCopies(500, "123"))) + }; static final int[] BD_LENGTHS = - { 1, 11, 11, 11, 4, 12 }; + { 1, 11, 11, 11, 4, 12, 19, 19, 18, 18 }; /* * This is the smallest difference between two doubles in D_VALS @@ -335,7 +340,11 @@ public void testNumericOther() { if (null == BD_VALS[i]) { assertEquals(BD_VALS[i], decoded); } else { - assertEquals("Deserialization failed.", 0, BD_VALS[i].compareTo(decoded)); + // The number is rounded to a specific precision in the encoding phase, + // so big values lose precision here. The expected value must be normalized to + // make the test pass. + assertEquals("Deserialization failed.", 0, + OrderedBytes.normalize(BD_VALS[i]).compareTo(decoded)); } assertEquals("Did not consume enough bytes.", BD_LENGTHS[i], buf1.getPosition() - 1); } From 50a7e457e9e0d83bfb029ab8cbd2becfea8e1c20 Mon Sep 17 00:00:00 2001 From: Yutong Sean Date: Sat, 18 Dec 2021 17:53:31 +0800 Subject: [PATCH 6/6] Added comment about the precision losing of large numbers --- .../main/java/org/apache/hadoop/hbase/util/OrderedBytes.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java index 635df3db4dd1..3a9a0c35cae7 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java @@ -802,6 +802,8 @@ public static int encodeNumeric(PositionedByteRange dst, double val, Order ord) /** * Encode a numerical value using the variable-length encoding. + * If the number of significant digits of the value exceeds the + * {@link OrderedBytes#MAX_PRECISION}, the exceeding part will be lost. * @param dst The destination to which encoded digits are written. * @param val The value to encode. * @param ord The {@link Order} to respect while encoding {@code val}.