From 7a3ad46ff86bd3d2d47f6a56bace1a0c4fd171c8 Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Wed, 8 Feb 2017 17:11:07 -0800 Subject: [PATCH 01/13] Swamidass & Baldi approx. items in intersection of two Bloom filters. Also function to create union (non-mutation) of two Bloom filters. --- .../apache/spark/util/sketch/BloomFilter.java | 5 ++ .../spark/util/sketch/BloomFilterImpl.java | 56 +++++++++++++++++++ .../sketch/IncompatibleUnionException.java | 24 ++++++++ 3 files changed, 85 insertions(+) create mode 100644 common/sketch/src/main/java/org/apache/spark/util/sketch/IncompatibleUnionException.java diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index c0b425e729595..009ba5c997d16 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -80,6 +80,11 @@ int getVersionNumber() { */ public abstract long bitSize(); + /** + * Swamidass & Baldi (2007) approximation for number of items in a Bloom filter + */ + public abstract double approxItems(); + /** * Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of * {@linkplain #mightContain(Object)} with the same item will always return {@code true}. 
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index 92c28bcb56a5a..ea769e5e34515 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -220,6 +220,62 @@ public BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeExcep return this; } + @Override + public double approxItems() { + double m = bitSize(); + return (m / numHashFunctions) * Math.log(1 - (bits.cardinality() / m)); + } + + /** + * Returns a new Bloom filter of the union of two Bloom filters. + * Unlike mergeInplace, this will not cause a mutation. + * Callers must ensure the bloom filters are appropriately sized to avoid saturating them. + * + * @throws IncompatibleUnionException if either are null, different classes, or different size or number of hash functions + */ + public static BloomFilterImpl createUnionBloomFilter(BloomFilter bf1, BloomFilter bf2) throws IncompatibleUnionException { + // Duplicates the logic of `isCompatible` here to provide better error message. 
+ if (bf1 == null || bf2 == null) { + throw new IncompatibleUnionException("Cannot union null bloom filters"); + } + + if (!(bf1 instanceof BloomFilterImpl)) { + throw new IncompatibleUnionException( + "Cannot union bloom filter of class " + bf1.getClass().getName() + ); + } else if (!(bf2 instanceof BloomFilterImpl)) { + throw new IncompatibleUnionException( + "Cannot union bloom filter of class " + bf2.getClass().getName() + ); + } + + BloomFilterImpl bfImpl1 = (BloomFilterImpl) bf1; + BloomFilterImpl bfImpl2 = (BloomFilterImpl) bf2; + + if (bfImpl1.bitSize() != bfImpl2.bitSize()) { + throw new IncompatibleUnionException("Cannot union bloom filters with different bit size"); + } + + if (bfImpl1.numHashFunctions != bfImpl2.numHashFunctions) { + throw new IncompatibleUnionException("Cannot union bloom filters with different number of hash functions"); + } + + BloomFilterImpl bfUnion = (BloomFilterImpl)BloomFilter.create(bf1.bitSize()); + + bfUnion.bits.putAll(bfImpl1.bits); + bfUnion.bits.putAll(bfImpl2.bits); + return bfUnion; + } + + /** + * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters + */ + public static double approxItemsInIntersection(BloomFilterImpl bf1, BloomFilterImpl bf2) throws IncompatibleUnionException { + BloomFilterImpl union = createUnionBloomFilter(bf1, bf2); + + return bf1.approxItems() + bf2.approxItems() - union.approxItems(); + } + @Override public void writeTo(OutputStream out) throws IOException { DataOutputStream dos = new DataOutputStream(out); diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/IncompatibleUnionException.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/IncompatibleUnionException.java new file mode 100644 index 0000000000000..5ac68e5d5ed47 --- /dev/null +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/IncompatibleUnionException.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util.sketch; + +public class IncompatibleUnionException extends Exception { + public IncompatibleUnionException(String message) { + super(message); + } +} From b9680c57b2f8b1d93c28884de9a7ebbe52505f6c Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Wed, 8 Feb 2017 17:42:36 -0800 Subject: [PATCH 02/13] Changed createUnionBloomFilter & approxItemsInIntersection to be instance instead of static functions --- .../spark/util/sketch/BloomFilterImpl.java | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index ea769e5e34515..4c1ee2854c683 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -233,47 +233,42 @@ public double approxItems() { * * @throws IncompatibleUnionException if either are null, different classes, or different size or number of hash functions */ - public static BloomFilterImpl createUnionBloomFilter(BloomFilter bf1, BloomFilter bf2) throws IncompatibleUnionException { + public BloomFilterImpl 
createUnionBloomFilter(BloomFilter other) throws IncompatibleUnionException { // Duplicates the logic of `isCompatible` here to provide better error message. - if (bf1 == null || bf2 == null) { + if (other == null) { throw new IncompatibleUnionException("Cannot union null bloom filters"); } - if (!(bf1 instanceof BloomFilterImpl)) { - throw new IncompatibleUnionException( - "Cannot union bloom filter of class " + bf1.getClass().getName() - ); - } else if (!(bf2 instanceof BloomFilterImpl)) { + if (!(other instanceof BloomFilterImpl)) { throw new IncompatibleUnionException( - "Cannot union bloom filter of class " + bf2.getClass().getName() + "Cannot union bloom filter of class " + other.getClass().getName() ); } - BloomFilterImpl bfImpl1 = (BloomFilterImpl) bf1; - BloomFilterImpl bfImpl2 = (BloomFilterImpl) bf2; + BloomFilterImpl that = (BloomFilterImpl) other; - if (bfImpl1.bitSize() != bfImpl2.bitSize()) { + if (this.bitSize() != that.bitSize()) { throw new IncompatibleUnionException("Cannot union bloom filters with different bit size"); } - if (bfImpl1.numHashFunctions != bfImpl2.numHashFunctions) { + if (this.numHashFunctions != that.numHashFunctions) { throw new IncompatibleUnionException("Cannot union bloom filters with different number of hash functions"); } - BloomFilterImpl bfUnion = (BloomFilterImpl)BloomFilter.create(bf1.bitSize()); + BloomFilterImpl bfUnion = (BloomFilterImpl)BloomFilter.create(bitSize()); - bfUnion.bits.putAll(bfImpl1.bits); - bfUnion.bits.putAll(bfImpl2.bits); + bfUnion.bits.putAll(this.bits); + bfUnion.bits.putAll(that.bits); return bfUnion; } /** * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters */ - public static double approxItemsInIntersection(BloomFilterImpl bf1, BloomFilterImpl bf2) throws IncompatibleUnionException { - BloomFilterImpl union = createUnionBloomFilter(bf1, bf2); + public double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException { + 
BloomFilterImpl union = createUnionBloomFilter(that); - return bf1.approxItems() + bf2.approxItems() - union.approxItems(); + return this.approxItems() + that.approxItems() - union.approxItems(); } @Override From 501ad7e22101b00862c0c77ef8c38e1b166d33a4 Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Wed, 8 Feb 2017 17:53:50 -0800 Subject: [PATCH 03/13] Updated abstract class to reflect changes in previous commit --- .../org/apache/spark/util/sketch/BloomFilter.java | 14 ++++++++++++++ .../apache/spark/util/sketch/BloomFilterImpl.java | 12 ++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index 009ba5c997d16..122cbfb68830b 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -152,6 +152,20 @@ int getVersionNumber() { */ public abstract boolean mightContainBinary(byte[] item); + /** + * Returns a new Bloom filter of the union of two Bloom filters. + * Unlike mergeInplace, this will not cause a mutation. + * Callers must ensure the bloom filters are appropriately sized to avoid saturating them. + * + * @throws IncompatibleUnionException if either are null, different classes, or different size or number of hash functions + */ + public abstract BloomFilterImpl createUnionBloomFilter(BloomFilter other) throws IncompatibleUnionException; + + /** + * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters + */ + public abstract double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException; + /** * Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's * responsibility to close the stream. 
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index 4c1ee2854c683..0a064a174afed 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -226,13 +226,7 @@ public double approxItems() { return (m / numHashFunctions) * Math.log(1 - (bits.cardinality() / m)); } - /** - * Returns a new Bloom filter of the union of two Bloom filters. - * Unlike mergeInplace, this will not cause a mutation. - * Callers must ensure the bloom filters are appropriately sized to avoid saturating them. - * - * @throws IncompatibleUnionException if either are null, different classes, or different size or number of hash functions - */ + @Override public BloomFilterImpl createUnionBloomFilter(BloomFilter other) throws IncompatibleUnionException { // Duplicates the logic of `isCompatible` here to provide better error message. 
if (other == null) { @@ -262,9 +256,7 @@ public BloomFilterImpl createUnionBloomFilter(BloomFilter other) throws Incompat return bfUnion; } - /** - * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters - */ + @Override public double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException { BloomFilterImpl union = createUnionBloomFilter(that); From c2a775d09b78feb160a30e7e430bc7abba988bbd Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Thu, 9 Feb 2017 10:42:29 -0800 Subject: [PATCH 04/13] =?UTF-8?q?renamed=20=E2=80=98createUnionBloomFilter?= =?UTF-8?q?=E2=80=99=20to=20=E2=80=98createUnion=E2=80=99=20as=20per=20@rx?= =?UTF-8?q?in=20suggestion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/org/apache/spark/util/sketch/BloomFilterImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index 0a064a174afed..017fbaa34d961 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -227,7 +227,7 @@ public double approxItems() { } @Override - public BloomFilterImpl createUnionBloomFilter(BloomFilter other) throws IncompatibleUnionException { + public BloomFilterImpl createUnion(BloomFilter other) throws IncompatibleUnionException { // Duplicates the logic of `isCompatible` here to provide better error message. 
 if (other == null) { throw new IncompatibleUnionException("Cannot union null bloom filters"); @@ -258,7 +258,7 @@ public BloomFilterImpl createUnionBloomFilter(BloomFilter other) throws Incompat @Override public double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException { - BloomFilterImpl union = createUnionBloomFilter(that); + BloomFilterImpl union = createUnion(that); return this.approxItems() + that.approxItems() - union.approxItems(); } From 40a3954e8e03dc4684162e869dbe1548d4ca4cdb Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Thu, 9 Feb 2017 10:47:13 -0800 Subject: [PATCH 05/13] renamed createUnion to union --- .../main/java/org/apache/spark/util/sketch/BloomFilter.java | 2 +- .../java/org/apache/spark/util/sketch/BloomFilterImpl.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index 122cbfb68830b..5381c63b5567b 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -159,7 +159,7 @@ int getVersionNumber() { * * @throws IncompatibleUnionException if either are null, different classes, or different size or number of hash functions */ - public abstract BloomFilterImpl createUnionBloomFilter(BloomFilter other) throws IncompatibleUnionException; + public abstract BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionException; /** * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index 017fbaa34d961..d0a67bace9033 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ 
b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -227,7 +227,7 @@ public double approxItems() { } @Override - public BloomFilterImpl createUnion(BloomFilter other) throws IncompatibleUnionException { + public BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionException { // Duplicates the logic of `isCompatible` here to provide better error message. if (other == null) { throw new IncompatibleUnionException("Cannot union null bloom filters"); @@ -258,7 +258,7 @@ public BloomFilterImpl createUnion(BloomFilter other) throws IncompatibleUnionEx @Override public double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException { - BloomFilterImpl union = createUnion(that); + BloomFilterImpl union = union(that); return this.approxItems() + that.approxItems() - union.approxItems(); } From ea554a632f46d66503a1f78624498cfdba3fc3e6 Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Thu, 9 Feb 2017 10:55:45 -0800 Subject: [PATCH 06/13] Added some more javadoc --- .../main/java/org/apache/spark/util/sketch/BloomFilter.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index 5381c63b5567b..b10d341c39cbb 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -157,12 +157,16 @@ int getVersionNumber() { * Unlike mergeInplace, this will not cause a mutation. * Callers must ensure the bloom filters are appropriately sized to avoid saturating them. * - * @throws IncompatibleUnionException if either are null, different classes, or different size or number of hash functions + * @param other The bloom filter to union this bloom filter with. 
+ * @throws IncompatibleUnionException if {@code isCompatible(other) == false} */ public abstract BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionException; /** * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters + * + * @param other The bloom filter to intersect this bloom filter with. + * @throws IncompatibleUnionException if {@code isCompatible(other) == false} */ public abstract double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException; From 9cd716506daaf3508a984713bc8a267e4d1bfcb2 Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Thu, 9 Feb 2017 12:59:42 -0800 Subject: [PATCH 07/13] =?UTF-8?q?Readded=20=E2=80=98-=E2=80=98=20to=20star?= =?UTF-8?q?t=20of=20approxItems()=20formula=20that=20somehow=20got=20remov?= =?UTF-8?q?ed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/org/apache/spark/util/sketch/BloomFilterImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index d0a67bace9033..942fb84cd3408 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -223,7 +223,7 @@ public BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeExcep @Override public double approxItems() { double m = bitSize(); - return (m / numHashFunctions) * Math.log(1 - (bits.cardinality() / m)); + return (-m / numHashFunctions) * Math.log(1 - (bits.cardinality() / m)); } @Override From 0ac7bf199b7d026947fd90c36ab302eab69c6b4e Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Thu, 9 Feb 2017 13:08:22 -0800 Subject: [PATCH 08/13] Handling of expected size being converted by BitArray concur --- 
.../main/java/org/apache/spark/util/sketch/BloomFilterImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java index 942fb84cd3408..f3b0d3f871075 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java @@ -249,7 +249,7 @@ public BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionExceptio throw new IncompatibleUnionException("Cannot union bloom filters with different number of hash functions"); } - BloomFilterImpl bfUnion = (BloomFilterImpl)BloomFilter.create(bitSize()); + BloomFilterImpl bfUnion = (BloomFilterImpl)BloomFilter.create(bitSize()/Long.SIZE); bfUnion.bits.putAll(this.bits); bfUnion.bits.putAll(that.bits); From 0a3910e324147ab76a020f5f4e72fc40379abe7e Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Fri, 10 Feb 2017 13:07:39 -0800 Subject: [PATCH 09/13] Added more description (formula + explanation) of approxItems and approxItemsInIntersection. 
Also added reference to paper --- .../apache/spark/util/sketch/BloomFilter.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index b10d341c39cbb..ec0b8c63b74f7 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -82,6 +82,16 @@ int getVersionNumber() { /** * Swamidass & Baldi (2007) approximation for number of items in a Bloom filter + * + * n* = - m/k * ln(1- X/m) + * where: + * n* = the estimated number of items in the Bloom filter, + * k = the number of hash functions used (k-fold compression), + * m = the length of the filter, + * X = the number of bits set to one + * + * @seealso + * Mathematical Correction for Fingerprint Similarity Measures to Improve Chemical Retrieval */ public abstract double approxItems(); @@ -165,8 +175,14 @@ int getVersionNumber() { /** * Swamidass & Baldi (2007) approximation for number of items in the intersection of two Bloom filters * + * n(A* ∩ B*) = n(A*) + n(B*) - n(A* ∪ B*) + * The approx. of the intersection is the approx. of A plus B minus the approx. of their union + * * @param other The bloom filter to intersect this bloom filter with. * @throws IncompatibleUnionException if {@code isCompatible(other) == false} + * @seealso #approxItems() + * @seealso + * Mathematical Correction for Fingerprint Similarity Measures to Improve Chemical Retrieval */ public abstract double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException; From 8ec6092ee1c04bd8b183c3a0dcd03fdfe6292d4b Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Fri, 10 Feb 2017 13:52:26 -0800 Subject: [PATCH 10/13] Changed @seealso to @see. 
Added a note about approximation divergence --- .../java/org/apache/spark/util/sketch/BloomFilter.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index ec0b8c63b74f7..7f76700bc2beb 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -90,7 +90,10 @@ int getVersionNumber() { * m = the length of the filter, * X = the number of bits set to one * - * @seealso + * Note: the approximation is not valid when the Bloom filter is close to full + * since it yields a diverging value. + * + * @see * Mathematical Correction for Fingerprint Similarity Measures to Improve Chemical Retrieval */ public abstract double approxItems(); @@ -169,6 +172,7 @@ int getVersionNumber() { * * @param other The bloom filter to union this bloom filter with. * @throws IncompatibleUnionException if {@code isCompatible(other) == false} + * @see #approxItems() */ public abstract BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionException; @@ -180,8 +184,8 @@ int getVersionNumber() { * * @param other The bloom filter to intersect this bloom filter with. 
* @throws IncompatibleUnionException if {@code isCompatible(other) == false} - * @seealso #approxItems() - * @seealso + * @see #approxItems() + * @see * Mathematical Correction for Fingerprint Similarity Measures to Improve Chemical Retrieval */ public abstract double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException; From 5e085c0575380cf3c26f1ae6051baa5d07d45708 Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Fri, 10 Feb 2017 15:40:37 -0800 Subject: [PATCH 11/13] Wrote test for approxItems() being infinity when full --- .../spark/util/sketch/BloomFilterSuite.scala | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala index a0408d2da4dff..dca66b7eaa6ae 100644 --- a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala +++ b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala @@ -99,6 +99,18 @@ class BloomFilterSuite extends FunSuite { // scalastyle:ignore funsuite } } + def testApproxItems[T: ClassTag](): Unit = { + test("approxItems") { + val filter = BloomFilter.create(10) + + for (x <- 1 to 1000) { + filter.putLong(x) + } + + assert(filter.approxItems().isInfinite) + } + } + def testItemType[T: ClassTag](typeName: String, numItems: Int)(itemGen: Random => T): Unit = { testAccuracy[T](typeName, numItems)(itemGen) testMergeInPlace[T](typeName, numItems)(itemGen) @@ -131,4 +143,6 @@ class BloomFilterSuite extends FunSuite { // scalastyle:ignore funsuite filter1.mergeInPlace(filter2) } } + + testApproxItems() } From d1921f84499b52c5312a96de1d7bf5a19c03fcea Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Fri, 10 Feb 2017 15:46:43 -0800 Subject: [PATCH 12/13] Added a non-full Bloom filter to testApproxItems --- .../apache/spark/util/sketch/BloomFilterSuite.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala index dca66b7eaa6ae..68612d26c1f8f 100644 --- a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala +++ b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala @@ -101,13 +101,15 @@ class BloomFilterSuite extends FunSuite { // scalastyle:ignore funsuite def testApproxItems[T: ClassTag](): Unit = { test("approxItems") { - val filter = BloomFilter.create(10) - + val filter1 = BloomFilter.create(10) + val filter2 = BloomFilter.create(10000) for (x <- 1 to 1000) { - filter.putLong(x) + filter1.putLong(x) + filter2.putLong(x) } - assert(filter.approxItems().isInfinite) + assert(filter1.approxItems().isInfinite) + assert(!filter2.approxItems().isInfinite) } } From 2cbf490cfc9dc216b1e52e66534193a789ab3d32 Mon Sep 17 00:00:00 2001 From: Bcpoole Date: Fri, 10 Feb 2017 17:02:49 -0800 Subject: [PATCH 13/13] =?UTF-8?q?Added=20explanation=20as=20to=20why=20can?= =?UTF-8?q?=E2=80=99t=20run=20approxItems()=20directly=20on=20the=20inters?= =?UTF-8?q?ection=20of=20A=20&=20B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/org/apache/spark/util/sketch/BloomFilter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index 7f76700bc2beb..d05e0cd130310 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -182,6 +182,10 @@ int getVersionNumber() { * n(A* ∩ B*) = n(A*) + n(B*) - n(A* ∪ B*) * The approx. of the intersection is the approx. of A plus B minus the approx. 
of their union * + * Running approxItems() directly on A ∩ B leads to overestimation because "some bits in A ∩ B are + * set to 1 by chance and do not correspond to a compression of bits present in the uncompressed intersection vector + * (A->)* ∩ (B->)*" + * * @param other The bloom filter to intersect this bloom filter with. * @throws IncompatibleUnionException if {@code isCompatible(other) == false} * @see #approxItems()