diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6a5536023821..2babac26f567 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -153,6 +153,9 @@ API Changes (with optional timeout) or asynchronously via CompletableFuture. Backward compatible - existing code that ignores the return value works unchanged. (Salvatore Campagna) +* GITHUB#15422: Add DirectWriter#bytesRequired, which computes how many bytes are written when encoding a + given number of values with a given number of bits per value. (Ignacio Vera) + New Features --------------------- * GITHUB#15328: VectorSimilarityFunction.getValues() now implements doubleVal allowing its diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java index 9d846d3e9a2f..c9d2cbe52db6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectWriter.java @@ -149,11 +149,20 @@ public void finish() throws IOException { } assert !finished; flush(); + // add padding bytes for fast io + final int paddingBytesNeeded = paddingBytesNeeded(bitsPerValue); + for (int i = 0; i < paddingBytesNeeded; i++) { + output.writeByte((byte) 0); + } + finished = true; + } + + private static int paddingBytesNeeded(int bitsPerValue) { // for every number of bits per value, we want to be able to read the entire value in a single // read e.g. 
for 20 bits per value, we want to be able to read values using ints so we need // 32 - 20 = 12 bits of padding - int paddingBitsNeeded; + final int paddingBitsNeeded; if (bitsPerValue > Integer.SIZE) { paddingBitsNeeded = Long.SIZE - bitsPerValue; } else if (bitsPerValue > Short.SIZE) { @@ -166,20 +175,20 @@ public void finish() throws IOException { assert paddingBitsNeeded >= 0; final int paddingBytesNeeded = (paddingBitsNeeded + Byte.SIZE - 1) / Byte.SIZE; assert paddingBytesNeeded <= 3; - - for (int i = 0; i < paddingBytesNeeded; i++) { - output.writeByte((byte) 0); - } - finished = true; + return paddingBytesNeeded; } /** Returns an instance suitable for encoding {@code numValues} using {@code bitsPerValue} */ public static DirectWriter getInstance(DataOutput output, long numValues, int bitsPerValue) { + checkBitsPerValue(bitsPerValue); + return new DirectWriter(output, numValues, bitsPerValue); + } + + private static void checkBitsPerValue(int bitsPerValue) { if (Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) < 0) { throw new IllegalArgumentException( "Unsupported bitsPerValue " + bitsPerValue + ". Did you use bitsRequired?"); } - return new DirectWriter(output, numValues, bitsPerValue); } /** @@ -224,4 +233,24 @@ public static int unsignedBitsRequired(long maxValue) { static final int[] SUPPORTED_BITS_PER_VALUE = new int[] {1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64}; + + /** + * Returns how many bytes are written for encoding {@code numValues} using {@code bitsPerValue}. 
+ * + * @param numValues total number of values, must not be negative + * @param bitsPerValue the number of bits required per value + * @return The amount of bytes written + * @throws IllegalArgumentException if numValues is negative or bitsPerValue is unsupported + * @throws ArithmeticException if the total number of bits does not fit in a long + */ + public static long bytesRequired(long numValues, int bitsPerValue) { + checkBitsPerValue(bitsPerValue); + if (numValues < 0) { + throw new IllegalArgumentException("numValues must not be negative, got: " + numValues); + } + // fail loudly on overflow instead of returning a wrapped-around size + final long bits = Math.multiplyExact(numValues, bitsPerValue); + final long bytes = Math.addExact(bits, Byte.SIZE - 1) / Byte.SIZE; + return bytes + paddingBytesNeeded(bitsPerValue); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java index 3ed18de0fef6..78f939577fe0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectPacked.java @@ -128,6 +128,8 @@ private void doTestBpv(Directory directory, int bpv, long offset, boolean merge) writer.finish(); output.close(); IndexInput input = directory.openInput(name, IOContext.DEFAULT); + assertEquals( + input.length() - offset, DirectWriter.bytesRequired(original.length, bitsRequired)); LongValues reader; if (merge) { reader =