From 91adce590461dda885d88319a700a775e63f9ce6 Mon Sep 17 00:00:00 2001
From: Marco Gaido <marcogaido91@gmail.com>
Date: Tue, 4 Sep 2018 17:02:07 +0200
Subject: [PATCH 1/2] [SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash
 on UTF8String

---
 .../spark/unsafe/hash/Murmur3_x86_32.java     | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index aff6e93d647fe..c15dfe08db801 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -19,6 +19,7 @@
 
 import com.google.common.primitives.Ints;
 
+import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.memory.MemoryBlock;
 import org.apache.spark.unsafe.types.UTF8String;
 
@@ -59,7 +60,7 @@ public static int hashUnsafeWordsBlock(MemoryBlock base, int seed) {
     // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method.
     int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)";
-    int h1 = hashBytesByIntBlock(base, seed);
+    int h1 = hashBytesByIntBlock(base, lengthInBytes, seed);
     return fmix(h1, lengthInBytes);
   }
 
@@ -69,14 +70,19 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i
   }
 
   public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
+    return hashUnsafeBytesBlock(base, Ints.checkedCast(base.size()), seed);
+  }
+
+  private static int hashUnsafeBytesBlock(MemoryBlock base, int lengthInBytes, int seed) {
     // This is not compatible with original and another implementations.
     // But remain it for backward compatibility for the components existing before 2.3.
-    int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
     int lengthAligned = lengthInBytes - lengthInBytes % 4;
-    int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
+    int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
+    long offset = base.getBaseOffset();
+    Object o = base.getBaseObject();
     for (int i = lengthAligned; i < lengthInBytes; i++) {
-      int halfWord = base.getByte(i);
+      int halfWord = Platform.getByte(o, offset + i);
       int k1 = mixK1(halfWord);
       h1 = mixH1(h1, k1);
     }
@@ -84,7 +90,7 @@ public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
   }
 
   public static int hashUTF8String(UTF8String str, int seed) {
-    return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
+    return hashUnsafeBytesBlock(str.getMemoryBlock(), str.numBytes(), seed);
   }
 
   public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
@@ -101,7 +107,7 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
     int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";
     int lengthAligned = lengthInBytes - lengthInBytes % 4;
-    int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
+    int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
     int k1 = 0;
     for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) {
       k1 ^= (base.getByte(i) & 0xFF) << shift;
@@ -110,12 +116,13 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
     return fmix(h1, lengthInBytes);
   }
 
-  private static int hashBytesByIntBlock(MemoryBlock base, int seed) {
-    long lengthInBytes = base.size();
+  private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, int seed) {
+    Object o = base.getBaseObject();
+    long offset = base.getBaseOffset();
     assert (lengthInBytes % 4 == 0);
     int h1 = seed;
-    for (long i = 0; i < lengthInBytes; i += 4) {
-      int halfWord = base.getInt(i);
+    for (int i = 0; i < lengthInBytes; i += 4) {
+      int halfWord = Platform.getInt(o, offset + i);
       int k1 = mixK1(halfWord);
       h1 = mixH1(h1, k1);
     }

From 6cb94ed5ed76e23e4fb775a2fd6f0e66e9c15abd Mon Sep 17 00:00:00 2001
From: Marco Gaido <marcogaido91@gmail.com>
Date: Wed, 5 Sep 2018 15:56:03 +0200
Subject: [PATCH 2/2] use MemoryBlock in hashBytesByIntBlock according to
 kiszk's comment

---
 .../java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java     | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index c15dfe08db801..566f116154302 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -117,12 +117,10 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
   }
 
   private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, int seed) {
-    Object o = base.getBaseObject();
-    long offset = base.getBaseOffset();
     assert (lengthInBytes % 4 == 0);
     int h1 = seed;
     for (int i = 0; i < lengthInBytes; i += 4) {
-      int halfWord = Platform.getInt(o, offset + i);
+      int halfWord = base.getInt(i);
       int k1 = mixK1(halfWord);
       h1 = mixH1(h1, k1);
     }