From 91adce590461dda885d88319a700a775e63f9ce6 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 4 Sep 2018 17:02:07 +0200 Subject: [PATCH 1/2] [SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String --- .../spark/unsafe/hash/Murmur3_x86_32.java | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index aff6e93d647fe..c15dfe08db801 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -19,6 +19,7 @@ import com.google.common.primitives.Ints; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.types.UTF8String; @@ -59,7 +60,7 @@ public static int hashUnsafeWordsBlock(MemoryBlock base, int seed) { // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method. int lengthInBytes = Ints.checkedCast(base.size()); assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)"; - int h1 = hashBytesByIntBlock(base, seed); + int h1 = hashBytesByIntBlock(base, lengthInBytes, seed); return fmix(h1, lengthInBytes); } @@ -69,14 +70,19 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i } public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) { + return hashUnsafeBytesBlock(base, Ints.checkedCast(base.size()), seed); + } + + private static int hashUnsafeBytesBlock(MemoryBlock base, int lengthInBytes, int seed) { // This is not compatible with original and another implementations. // But remain it for backward compatibility for the components existing before 2.3. - int lengthInBytes = Ints.checkedCast(base.size()); assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; - int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed); + int h1 = hashBytesByIntBlock(base, lengthAligned, seed); + long offset = base.getBaseOffset(); + Object o = base.getBaseObject(); for (int i = lengthAligned; i < lengthInBytes; i++) { - int halfWord = base.getByte(i); + int halfWord = Platform.getByte(o, offset + i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); } @@ -84,7 +90,7 @@ public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) { } public static int hashUTF8String(UTF8String str, int seed) { - return hashUnsafeBytesBlock(str.getMemoryBlock(), seed); + return hashUnsafeBytesBlock(str.getMemoryBlock(), str.numBytes(), seed); } public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { @@ -101,7 +107,7 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) { int lengthInBytes = Ints.checkedCast(base.size()); assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; - int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed); + int h1 = hashBytesByIntBlock(base, lengthAligned, seed); int k1 = 0; for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) { k1 ^= (base.getByte(i) & 0xFF) << shift; @@ -110,12 +116,13 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) { return fmix(h1, lengthInBytes); } - private static int hashBytesByIntBlock(MemoryBlock base, int seed) { - long lengthInBytes = base.size(); + private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, int seed) { + Object o = base.getBaseObject(); + long offset = base.getBaseOffset(); assert (lengthInBytes % 4 == 0); int h1 = seed; - for (long i = 0; i < lengthInBytes; i += 4) { - int halfWord = base.getInt(i); + for (int i = 0; i < lengthInBytes; i += 4) { + int halfWord = Platform.getInt(o, offset + i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); } From 6cb94ed5ed76e23e4fb775a2fd6f0e66e9c15abd Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 5 Sep 2018 15:56:03 +0200 Subject: [PATCH 2/2] use MemoryBlock in hashBytesByIntBlock according to kiszk's comment --- .../java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index c15dfe08db801..566f116154302 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -117,12 +117,10 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) { } private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, int seed) { - Object o = base.getBaseObject(); - long offset = base.getBaseOffset(); assert (lengthInBytes % 4 == 0); int h1 = seed; for (int i = 0; i < lengthInBytes; i += 4) { - int halfWord = Platform.getInt(o, offset + i); + int halfWord = base.getInt(i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); }