From ba303fde8caabf276b45addc8ba1375a1f0fa19e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Jan 2016 13:43:34 +0800 Subject: [PATCH 1/5] use a single column vararg for hash --- .../sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala | 1 + sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 915c585ec91f..0f5ee77bef6c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -163,6 +163,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { assertError(Coalesce(Seq('intField, 'booleanField)), "input to function coalesce should all be the same type") assertError(Coalesce(Nil), "input to function coalesce cannot be empty") + assertError(new Murmur3Hash(Nil), "arguments of function hash cannot be empty") assertError(Explode('intField), "input to function explode should be array or map type") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index e223e32fd702..1c96f647b634 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1820,8 +1820,8 @@ object functions extends LegacyFunctions { * @since 2.0 */ @scala.annotation.varargs - def hash(col: Column, cols: Column*): Column = withExpr { - new Murmur3Hash((col +: cols).map(_.expr)) + def hash(cols: Column*): Column = withExpr { + new Murmur3Hash(cols.map(_.expr)) } ////////////////////////////////////////////////////////////////////////////////////////////// From b652b4548fb2b9270f7ebd11397fdbc09a89f583 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Jan 2016 14:40:28 +0800 Subject: [PATCH 2/5] update error message --- .../scala/org/apache/spark/sql/catalyst/expressions/misc.scala | 2 +- .../sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 8834924687c0..6697d463614d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -200,7 +200,7 @@ case class Murmur3Hash(children: Seq[Expression], seed: Int) extends Expression override def checkInputDataTypes(): TypeCheckResult = { if (children.isEmpty) { - TypeCheckResult.TypeCheckFailure("arguments of function hash cannot be empty") + TypeCheckResult.TypeCheckFailure("function hash requires at least one argument") } else { TypeCheckResult.TypeCheckSuccess } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 0f5ee77bef6c..f3df716a5782 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -163,7 +163,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { assertError(Coalesce(Seq('intField, 'booleanField)), "input to function coalesce should all be the same type") assertError(Coalesce(Nil), "input to function coalesce cannot be empty") - assertError(new Murmur3Hash(Nil), "arguments of function hash cannot be empty") + assertError(new Murmur3Hash(Nil), "function hash requires at least one argument") assertError(Explode('intField), "input to function explode should be array or map type") } From 9c72e0115849623009e347c9e78be269a9172943 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Jan 2016 15:09:13 +0800 Subject: [PATCH 3/5] add python version --- python/pyspark/sql/functions.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 7c15e3845869..526b79ad7bad 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1017,6 +1017,17 @@ def sha2(col, numBits): jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) return Column(jc) +@since(2.0) +def hash(col): + """Calculates the hash code of given columns, and returns the result as a int column. + + >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect() + [Row(hash=1358996357)] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.hash(_to_java_column(col)) + return Column(jc) + # ---------------------- String/Binary functions ------------------------------ From f3a557b5534c506e6987388a84ae4e561585d895 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Jan 2016 16:19:00 +0800 Subject: [PATCH 4/5] fix style --- python/pyspark/sql/functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 526b79ad7bad..88026279b660 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1017,6 +1017,7 @@ def sha2(col, numBits): jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) return Column(jc) + @since(2.0) def hash(col): """Calculates the hash code of given columns, and returns the result as a int column. From 9e05b1e5b53afefbd8d16a8b8d9dd8df0f37c566 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Jan 2016 19:18:18 +0800 Subject: [PATCH 5/5] fix python --- python/pyspark/sql/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 88026279b660..b0390cb9942e 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1019,14 +1019,14 @@ def sha2(col, numBits): @since(2.0) -def hash(col): +def hash(*cols): """Calculates the hash code of given columns, and returns the result as a int column. >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect() [Row(hash=1358996357)] """ sc = SparkContext._active_spark_context - jc = sc._jvm.functions.hash(_to_java_column(col)) + jc = sc._jvm.functions.hash(_to_seq(sc, cols, _to_java_column)) return Column(jc)