[SPARK-24076][SQL] Use different seed in HashAggregate to avoid hash conflict

yucai · yucai · commit 08186183553a · 2018-04-25T17:44:08.000+08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
@@ -755,7 +755,10 @@ case class HashAggregateExec(
     }
 
     // generate hash code for key
-    val hashExpr = Murmur3Hash(groupingExpressions, 42)
+    // SPARK-24076: HashAggregate uses the same hash algorithm on the same expressions
+    // as ShuffleExchange, it may lead to bad hash conflict when shuffle.partitions=8192*n,
+    // pick a different seed to avoid this conflict
+    val hashExpr = Murmur3Hash(groupingExpressions, 48)
     val hashEval = BindReferences.bindReference(hashExpr, child.output).genCode(ctx)
 
     val (checkFallbackForGeneratedHashMap, checkFallbackForBytesToBytesMap, resetCounter,