Commit a5c2961

tarekbecker authored and Davies Liu committed
[SPARK-8235] [SQL] misc function sha / sha1
Jira: https://issues.apache.org/jira/browse/SPARK-8235

I added support for sha1. If I understood rxin correctly, sha and sha1 should execute the same algorithm, shouldn't they?

Please take a close look at the Python part. This is adopted from #6934.

Author: Tarek Auel <[email protected]>
Author: Tarek Auel <[email protected]>

Closes #6963 from tarekauel/SPARK-8235 and squashes the following commits:

f064563 [Tarek Auel] change to shaHex
7ce3cdc [Tarek Auel] rely on automatic cast
a1251d6 [Tarek Auel] Merge remote-tracking branch 'upstream/master' into SPARK-8235
68eb043 [Tarek Auel] added docstring
be5aff1 [Tarek Auel] improved error message
7336c96 [Tarek Auel] added type check
cf23a80 [Tarek Auel] simplified example
ebf75ef [Tarek Auel] [SPARK-8301] updated the python documentation. Removed sha in python and scala
6d6ff0d [Tarek Auel] [SPARK-8233] added docstring
ea191a9 [Tarek Auel] [SPARK-8233] fixed signature of python function. Added expected type to misc
e3fd7c3 [Tarek Auel] SPARK[8235] added sha to the list of __all__
e5dad4e [Tarek Auel] SPARK[8235] sha / sha1
1 parent 3664ee2 commit a5c2961

6 files changed (+81, -1 lines changed)

python/pyspark/sql/functions.py (14 additions & 0 deletions)

@@ -42,6 +42,7 @@
     'monotonicallyIncreasingId',
     'rand',
     'randn',
+    'sha1',
     'sha2',
     'sparkPartitionId',
     'struct',
@@ -382,6 +383,19 @@ def sha2(col, numBits):
     return Column(jc)


+@ignore_unicode_prefix
+@since(1.5)
+def sha1(col):
+    """Returns the hex string result of SHA-1.
+
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
+    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.sha1(_to_java_column(col))
+    return Column(jc)
+
+
 @since(1.4)
 def sparkPartitionId():
     """A column for partition ID of the Spark task.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala (2 additions & 0 deletions)

@@ -136,6 +136,8 @@ object FunctionRegistry {
     // misc functions
     expression[Md5]("md5"),
     expression[Sha2]("sha2"),
+    expression[Sha1]("sha1"),
+    expression[Sha1]("sha"),

     // aggregate functions
     expression[Average]("avg"),
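
Because both names are registered against the same Sha1 expression, sha and sha1 behave identically at the SQL level. A minimal sketch of how the aliases could be exercised (not part of the commit; it assumes a SQLContext named sqlContext and relies on the string literal being implicitly cast to binary, per the "rely on automatic cast" squash commit):

// Hypothetical check: both aliases should produce the same SHA-1 hex string.
val row = sqlContext.sql("SELECT sha1('ABC'), sha('ABC')").head()
assert(row.getString(0) == row.getString(1))
// expected value in both columns: 3c01bdbb26f358bab27f267924aa2c9a03fcfdb8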

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala (29 additions & 1 deletion)

@@ -21,8 +21,9 @@ import java.security.MessageDigest
 import java.security.NoSuchAlgorithmException

 import org.apache.commons.codec.digest.DigestUtils
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, DataType}
+import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

 /**
@@ -140,3 +141,30 @@ case class Sha2(left: Expression, right: Expression)
     """
   }
 }
+
+/**
+ * A function that calculates a sha1 hash value and returns it as a hex string
+ * For input of type [[BinaryType]] or [[StringType]]
+ */
+case class Sha1(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+  override def dataType: DataType = StringType
+
+  override def expectedChildTypes: Seq[DataType] = Seq(BinaryType)
+
+  override def eval(input: InternalRow): Any = {
+    val value = child.eval(input)
+    if (value == null) {
+      null
+    } else {
+      UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]]))
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c =>
+      "org.apache.spark.unsafe.types.UTF8String.fromString" +
+        s"(org.apache.commons.codec.digest.DigestUtils.shaHex($c))"
+    )
+  }
+}
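
The expression delegates the actual hashing to commons-codec's DigestUtils.shaHex, in both the interpreted eval path and the generated code. A standalone sketch (not part of the commit; the object name is arbitrary and commons-codec is assumed to be on the classpath) that reproduces the hex values asserted in the test suites below:

import org.apache.commons.codec.digest.DigestUtils

object Sha1HexDemo {
  def main(args: Array[String]): Unit = {
    // SHA-1 of "ABC", as used in the doctest and test suites: 3c01bdbb26f358bab27f267924aa2c9a03fcfdb8
    println(DigestUtils.shaHex("ABC".getBytes))
    // SHA-1 of the empty input: da39a3ee5e6b4b0d3255bfef95601890afd80709
    println(DigestUtils.shaHex("".getBytes))
  }
}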

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala (8 additions & 0 deletions)

@@ -31,6 +31,14 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Md5(Literal.create(null, BinaryType)), null)
   }

+  test("sha1") {
+    checkEvaluation(Sha1(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8")
+    checkEvaluation(Sha1(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
+      "5d211bad8f4ee70e16c7d343a838fc344a1ed961")
+    checkEvaluation(Sha1(Literal.create(null, BinaryType)), null)
+    checkEvaluation(Sha1(Literal("".getBytes)), "da39a3ee5e6b4b0d3255bfef95601890afd80709")
+  }
+
   test("sha2") {
     checkEvaluation(Sha2(Literal("ABC".getBytes), Literal(256)), DigestUtils.sha256Hex("ABC"))
     checkEvaluation(Sha2(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType), Literal(384)),

sql/core/src/main/scala/org/apache/spark/sql/functions.scala (16 additions & 0 deletions)

@@ -1414,6 +1414,22 @@
    */
   def md5(columnName: String): Column = md5(Column(columnName))

+  /**
+   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
+   *
+   * @group misc_funcs
+   * @since 1.5.0
+   */
+  def sha1(e: Column): Column = Sha1(e.expr)
+
+  /**
+   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
+   *
+   * @group misc_funcs
+   * @since 1.5.0
+   */
+  def sha1(columnName: String): Column = sha1(Column(columnName))
+
   /**
    * Calculates the SHA-2 family of hash functions and returns the value as a hex string.
    *
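
For context, a hedged sketch of how the new column functions could be used from the DataFrame API (not part of the commit; a SQLContext with its implicits imported and a DataFrame named logs with a string column user are assumed):

import org.apache.spark.sql.functions.sha1

// String input is hashed via the implicit string-to-binary cast mentioned in the commit.
val hashed = logs.select(sha1($"user").alias("user_sha1"))
// The columnName overload addresses the column by name and does the same thing.
val hashedByName = logs.select(sha1("user"))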

sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala (12 additions & 0 deletions)

@@ -144,6 +144,18 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
   }

+  test("misc sha1 function") {
+    val df = Seq(("ABC", "ABC".getBytes)).toDF("a", "b")
+    checkAnswer(
+      df.select(sha1($"a"), sha1("b")),
+      Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"))
+
+    val dfEmpty = Seq(("", "".getBytes)).toDF("a", "b")
+    checkAnswer(
+      dfEmpty.selectExpr("sha1(a)", "sha1(b)"),
+      Row("da39a3ee5e6b4b0d3255bfef95601890afd80709", "da39a3ee5e6b4b0d3255bfef95601890afd80709"))
+  }
+
   test("misc sha2 function") {
     val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")
     checkAnswer(
