-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-19527][Core] Approximate Size of Intersection of Bloom Filters #16864
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7a3ad46
b9680c5
501ad7e
c2a775d
40a3954
ea554a6
9cd7165
0ac7bf1
0a3910e
8ec6092
5e085c0
d1921f8
2cbf490
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,6 +80,24 @@ int getVersionNumber() { | |
| */ | ||
| public abstract long bitSize(); | ||
|
|
||
| /** | ||
| * Returns an estimate of the number of items that have been put into this Bloom filter, | ||
| * using the Swamidass &amp; Baldi (2007) approximation: | ||
| * | ||
| * <pre> | ||
| * n* = - m/k * ln(1 - X/m) | ||
| * </pre> | ||
| * where: | ||
| * n* = the estimated number of items in the Bloom filter, | ||
| * k = the number of hash functions used (k-fold compression), | ||
| * m = the length of the filter in bits, | ||
| * X = the number of bits set to one | ||
| * | ||
| * Note: the approximation is not valid when the Bloom filter is close to full | ||
| * since it yields a diverging value. | ||
| * | ||
| * @return the estimated number of items in this Bloom filter | ||
| * @see <a href="https://doi.org/10.1021/ci600526a"> | ||
| * Mathematical Correction for Fingerprint Similarity Measures to Improve Chemical Retrieval</a> | ||
| */ | ||
| public abstract double approxItems(); | ||
|
|
||
| /** | ||
| * Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of | ||
| * {@linkplain #mightContain(Object)} with the same item will always return {@code true}. | ||
|
|
@@ -147,6 +165,35 @@ int getVersionNumber() { | |
| */ | ||
| public abstract boolean mightContainBinary(byte[] item); | ||
|
|
||
| /** | ||
| * Returns a new Bloom filter that is the union of this Bloom filter and {@code other}. | ||
| * Unlike {@code mergeInPlace}, this will not cause a mutation. | ||
| * Callers must ensure the Bloom filters are appropriately sized to avoid saturating them. | ||
| * | ||
| * @param other The Bloom filter to union this Bloom filter with. | ||
| * @return a new Bloom filter holding the union of the two Bloom filters | ||
| * @throws IncompatibleUnionException if {@code isCompatible(other) == false} | ||
| * @see #approxItems() | ||
| */ | ||
| public abstract BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionException; | ||
|
|
||
| /** | ||
| * Estimates the number of items in the intersection of this Bloom filter and {@code that}. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| * Uses the Swamidass &amp; Baldi (2007) approximation: | ||
| * | ||
| * n(A* ∩ B*) = n(A*) + n(B*) - n(A* ∪ B*) | ||
| * The approx. of the intersection is the approx. of A plus B minus the approx. of their union | ||
| * | ||
| * Running approxItems() directly on A ∩ B leads to overestimation because "some bits in | ||
| * A ∩ B are set to 1 by chance and do not correspond to a compression of bits present in the | ||
| * uncompressed intersection vector {@literal (A->)* ∩ (B->)*}" | ||
| * | ||
| * @param that The Bloom filter to intersect this Bloom filter with. | ||
| * @return the estimated number of items contained in both Bloom filters | ||
| * @throws IncompatibleUnionException if {@code isCompatible(that) == false} | ||
| * @see #approxItems() | ||
| * @see <a href="https://doi.org/10.1021/ci600526a"> | ||
| * Mathematical Correction for Fingerprint Similarity Measures to Improve Chemical Retrieval</a> | ||
| */ | ||
| public abstract double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException; | ||
|
|
||
| /** | ||
| * Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's | ||
| * responsibility to close the stream. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -220,6 +220,49 @@ public BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeExcep | |
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public double approxItems() { | ||
| // m is the filter length in bits, held as a double so both divisions below are | ||
| // floating-point divisions rather than integer divisions. | ||
| double m = bitSize(); | ||
| // n* = -(m / k) * ln(1 - X / m) with k = numHashFunctions and X = bits.cardinality(). | ||
| // If bits.cardinality() equals m, the argument of Math.log is 0 and the result is | ||
| // +Infinity: this is the "diverging value" of a saturated filter. | ||
| return (-m / numHashFunctions) * Math.log(1 - (bits.cardinality() / m)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Assume you were thinking of Scala? |
||
| } | ||
|
|
||
| /** | ||
| * Builds a new Bloom filter into which the bits of both this filter and {@code other} | ||
| * are put. The result is created with the same bit size and the same number of hash | ||
| * functions as the two inputs, so that estimates computed on it are comparable with | ||
| * estimates computed on the inputs. | ||
| * | ||
| * @param other the Bloom filter to union with this one; must be a {@code BloomFilterImpl} | ||
| * with the same bit size and number of hash functions | ||
| * @return a newly allocated Bloom filter holding the union | ||
| * @throws IncompatibleUnionException if {@code other} is null, is not a | ||
| * {@code BloomFilterImpl}, or differs in bit size or number of hash functions | ||
| */ | ||
| @Override | ||
| public BloomFilterImpl union(BloomFilter other) throws IncompatibleUnionException { | ||
| // Duplicates the logic of `isCompatible` here to provide better error message. | ||
| if (other == null) { | ||
| throw new IncompatibleUnionException("Cannot union null bloom filters"); | ||
| } | ||
|
|
||
| if (!(other instanceof BloomFilterImpl)) { | ||
| throw new IncompatibleUnionException( | ||
| "Cannot union bloom filter of class " + other.getClass().getName() | ||
| ); | ||
| } | ||
|
|
||
| BloomFilterImpl that = (BloomFilterImpl) other; | ||
|
|
||
| if (this.bitSize() != that.bitSize()) { | ||
| throw new IncompatibleUnionException("Cannot union bloom filters with different bit size"); | ||
| } | ||
|
|
||
| if (this.numHashFunctions != that.numHashFunctions) { | ||
| throw new IncompatibleUnionException( | ||
| "Cannot union bloom filters with different number of hash functions" | ||
| ); | ||
| } | ||
|
|
||
| // The result must have exactly the geometry that was just validated on the inputs. | ||
| // BloomFilter.create(bitSize() / Long.SIZE) interpreted its argument as an expected item | ||
| // count and derived its own bit size and hash-function count from it, so the result did | ||
| // not match the inputs. | ||
| // NOTE(review): relies on the BloomFilterImpl(int numHashFunctions, long numBits) | ||
| // constructor declared elsewhere in this class - confirm it is accessible here. | ||
| BloomFilterImpl bfUnion = new BloomFilterImpl(this.numHashFunctions, this.bitSize()); | ||
|
|
||
| bfUnion.bits.putAll(this.bits); | ||
| bfUnion.bits.putAll(that.bits); | ||
| return bfUnion; | ||
| } | ||
|
|
||
| @Override | ||
| public double approxItemsInIntersection(BloomFilter that) throws IncompatibleUnionException { | ||
| BloomFilterImpl union = union(that); | ||
|
|
||
| return this.approxItems() + that.approxItems() - union.approxItems(); | ||
| } | ||
|
|
||
| @Override | ||
| public void writeTo(OutputStream out) throws IOException { | ||
| DataOutputStream dos = new DataOutputStream(out); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.util.sketch; | ||
|
|
||
| /** | ||
| * Checked exception thrown when two Bloom filters cannot be combined by a union, for | ||
| * example because one of them is null, they are of different classes, or they differ in | ||
| * bit size or number of hash functions. | ||
| */ | ||
| public class IncompatibleUnionException extends Exception { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we need some javadoc ere. |
||
| /** | ||
| * Creates the exception with a human-readable reason. | ||
| * | ||
| * @param message description of why the two Bloom filters cannot be unioned | ||
| */ | ||
| public IncompatibleUnionException(String message) { | ||
| super(message); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please describe the method first and its properties (approximation error). Then put the reference in
`@seealso` with a permanent link to the paper: https://dx.doi.org/10.1021%2Fci600526a