From 45cd1cd93e896bca396bffa38a382c6a3abce84f Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 13 Nov 2025 08:48:22 +0100 Subject: [PATCH 1/2] Add method in DirectWriter to compute how many bytes are written for encoding a number of values using a number of bits per value. --- .../lucene/util/packed/DirectWriter.java | 36 +++++++++++++++---- .../lucene/util/packed/TestDirectPacked.java | 2 ++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java index 9d846d3e9a2f..c9d2cbe52db6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java @@ -149,11 +149,20 @@ public void finish() throws IOException { } assert !finished; flush(); + // add padding bytes for fast io + final int paddingBytesNeeded = paddingBytesNeeded(bitsPerValue); + for (int i = 0; i < paddingBytesNeeded; i++) { + output.writeByte((byte) 0); + } + finished = true; + } + + private static int paddingBytesNeeded(int bitsPerValue) { // for every number of bits per value, we want to be able to read the entire value in a single // read e.g. 
for 20 bits per value, we want to be able to read values using ints so we need // 32 - 20 = 12 bits of padding - int paddingBitsNeeded; + final int paddingBitsNeeded; if (bitsPerValue > Integer.SIZE) { paddingBitsNeeded = Long.SIZE - bitsPerValue; } else if (bitsPerValue > Short.SIZE) { @@ -166,20 +175,20 @@ public void finish() throws IOException { assert paddingBitsNeeded >= 0; final int paddingBytesNeeded = (paddingBitsNeeded + Byte.SIZE - 1) / Byte.SIZE; assert paddingBytesNeeded <= 3; - - for (int i = 0; i < paddingBytesNeeded; i++) { - output.writeByte((byte) 0); - } - finished = true; + return paddingBytesNeeded; } /** Returns an instance suitable for encoding {@code numValues} using {@code bitsPerValue} */ public static DirectWriter getInstance(DataOutput output, long numValues, int bitsPerValue) { + checkBitsPerValue(bitsPerValue); + return new DirectWriter(output, numValues, bitsPerValue); + } + + private static void checkBitsPerValue(int bitsPerValue) { if (Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) < 0) { throw new IllegalArgumentException( "Unsupported bitsPerValue " + bitsPerValue + ". Did you use bitsRequired?"); } - return new DirectWriter(output, numValues, bitsPerValue); } /** @@ -224,4 +233,17 @@ public static int unsignedBitsRequired(long maxValue) { static final int[] SUPPORTED_BITS_PER_VALUE = new int[] {1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64}; + + /** + * Returns how many bytes are written for encoding {@code numValues} using {@code bitsPerValue}. 
+ * + * @param numValues total number of values + * @param bitsPerValue the number of bits required per value + * @return the number of bytes written + */ + public static long bytesRequired(long numValues, int bitsPerValue) { + checkBitsPerValue(bitsPerValue); + final long bytes = (numValues * bitsPerValue + Byte.SIZE - 1) / 8; + return bytes + paddingBytesNeeded(bitsPerValue); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java index 3ed18de0fef6..78f939577fe0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java @@ -128,6 +128,8 @@ private void doTestBpv(Directory directory, int bpv, long offset, boolean merge) writer.finish(); output.close(); IndexInput input = directory.openInput(name, IOContext.DEFAULT); + assertEquals( + input.length() - offset, DirectWriter.bytesRequired(original.length, bitsRequired)); LongValues reader; if (merge) { reader = From eab6612658e34a37fc285d6668fcb6d5e9c0ea2b Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 13 Nov 2025 10:08:45 +0100 Subject: [PATCH 2/2] entry in CHANGES.txt --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0742b8d96fa6..30b1c55f9116 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -184,6 +184,9 @@ Improvements * GITHUB#15332: Add PhraseQuery.Builder.setMaxTerms() method to limit the maximum number of terms and excessive memory use (linyunanit) +* GITHUB#15422: Add a new method in DirectWriter to compute how many bytes are written for encoding a number of values using a number of + bits per value. (Ignacio Vera) + Optimizations --------------------- * GITHUB#15140: Optimize TopScoreDocCollector with TernaryLongHeap for improved performance over Binary-LongHeap. (Ramakrishna Chilaka)