@@ -103,6 +103,12 @@
String arguments() default "";
String examples() default "";
String note() default "";
/**
* Valid group names are almost the same as those defined as `groupname` in
* `sql/functions.scala`, except that `collection_funcs` is split into three fine-grained
* groups: `array_funcs`, `map_funcs`, and `json_funcs`. See `ExpressionInfo` for the
Member: Why are all the csv _funcs left behind?

Member (Author): This is just a basic set for the 3.0 release. Yeah, we can improve the docs by adding more groups. Actually, we need to assign all the expressions to groups in order to remove the SQL Built-in Function doc:
#28224 (comment) @HyukjinKwon

Member: I see, thanks.

* detailed group names.
*/
String group() default "";
String since() default "";
String deprecated() default "";
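
For illustration, here is a minimal sketch of declaring and validating a group name through the ten-argument `ExpressionInfo` constructor that the tests in this PR exercise (the class and function names below are hypothetical, not from this PR):

import org.apache.spark.sql.catalyst.expressions.ExpressionInfo

// Constructor arguments, in order: className, db, name, usage, arguments,
// examples, note, group, since, deprecated.
val info = new ExpressionInfo(
  "org.example.MyArraySize", null, "my_array_size", null, "", "", "", "array_funcs", "", "")
assert(info.getGroup == "array_funcs")

// An unknown group name is rejected with an IllegalArgumentException:
// new ExpressionInfo("org.example.MyArraySize", null, "my_array_size",
//   null, "", "", "", "invalid_group_funcs", "", "")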
sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala (0 additions, 80 deletions)
@@ -22,16 +22,13 @@ import java.net.{MalformedURLException, URL}
import java.sql.{Date, Timestamp}
import java.util.concurrent.atomic.AtomicBoolean

import scala.collection.parallel.immutable.ParVector

import org.apache.spark.{AccumulatorSuite, SparkException}
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, Partial}
import org.apache.spark.sql.catalyst.optimizer.{ConvertToLocalRelation, NestedColumnAliasingSuite}
import org.apache.spark.sql.catalyst.plans.logical.Project
import org.apache.spark.sql.catalyst.util.StringUtils
import org.apache.spark.sql.execution.HiveResult.hiveResultString
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec}
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
@@ -126,83 +123,6 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
}
}

test("using _FUNC_ instead of function names in examples") {
val exampleRe = "(>.*;)".r
val setStmtRe = "(?i)^(>\\s+set\\s+).+".r
val ignoreSet = Set(
// Examples for CaseWhen show simpler syntax:
// `CASE WHEN ... THEN ... WHEN ... THEN ... END`
"org.apache.spark.sql.catalyst.expressions.CaseWhen",
// _FUNC_ is replaced by `locate` but `locate(... IN ...)` is not supported
"org.apache.spark.sql.catalyst.expressions.StringLocate",
// _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)`
"org.apache.spark.sql.catalyst.expressions.Remainder",
// Examples demonstrate alternative names, see SPARK-20749
"org.apache.spark.sql.catalyst.expressions.Length")
spark.sessionState.functionRegistry.listFunction().foreach { funcId =>
val info = spark.sessionState.catalog.lookupFunctionInfo(funcId)
val className = info.getClassName
withClue(s"Expression class '$className'") {
val exprExamples = info.getOriginalExamples
if (!exprExamples.isEmpty && !ignoreSet.contains(className)) {
assert(exampleRe.findAllIn(exprExamples).toIterable
.filter(setStmtRe.findFirstIn(_).isEmpty) // Ignore SET commands
.forall(_.contains("_FUNC_")))
}
}
}
}

test("check outputs of expression examples") {
def unindentAndTrim(s: String): String = {
s.replaceAll("\n\\s+", "\n").trim
}
val beginSqlStmtRe = " > ".r
val endSqlStmtRe = ";\n".r
def checkExampleSyntax(example: String): Unit = {
val beginStmtNum = beginSqlStmtRe.findAllIn(example).length
val endStmtNum = endSqlStmtRe.findAllIn(example).length
assert(beginStmtNum === endStmtNum,
"The number of ` > ` does not match to the number of `;`")
}
val exampleRe = """^(.+);\n(?s)(.+)$""".r
val ignoreSet = Set(
// One of examples shows getting the current timestamp
"org.apache.spark.sql.catalyst.expressions.UnixTimestamp",
// Random output without a seed
"org.apache.spark.sql.catalyst.expressions.Rand",
"org.apache.spark.sql.catalyst.expressions.Randn",
"org.apache.spark.sql.catalyst.expressions.Shuffle",
"org.apache.spark.sql.catalyst.expressions.Uuid",
// The example calls methods that return unstable results.
"org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection")

val parFuncs = new ParVector(spark.sessionState.functionRegistry.listFunction().toVector)
parFuncs.foreach { funcId =>
// Examples can change settings. We clone the session to prevent tests clashing.
val clonedSpark = spark.cloneSession()
// Coalescing partitions can change result order, so disable it.
clonedSpark.sessionState.conf.setConf(SQLConf.COALESCE_PARTITIONS_ENABLED, false)
val info = clonedSpark.sessionState.catalog.lookupFunctionInfo(funcId)
val className = info.getClassName
if (!ignoreSet.contains(className)) {
withClue(s"Function '${info.getName}', Expression class '$className'") {
val example = info.getExamples
checkExampleSyntax(example)
example.split(" > ").toList.foreach(_ match {
case exampleRe(sql, output) =>
val df = clonedSpark.sql(sql)
val actual = unindentAndTrim(
hiveResultString(df.queryExecution.executedPlan).mkString("\n"))
val expected = unindentAndTrim(output)
assert(actual === expected)
case _ =>
})
}
}
}
}

test("SPARK-6743: no columns from cache") {
Seq(
(83, 0, 38),
sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala (0 additions, 31 deletions)
@@ -20,8 +20,6 @@ package org.apache.spark.sql
import java.math.BigDecimal

import org.apache.spark.sql.api.java._
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.plans.logical.Project
import org.apache.spark.sql.execution.{QueryExecution, SimpleMode}
import org.apache.spark.sql.execution.columnar.InMemoryRelation
@@ -534,35 +532,6 @@ class UDFSuite extends QueryTest with SharedSparkSession {
assert(spark.range(2).select(nonDeterministicJavaUDF()).distinct().count() == 2)
}

test("Replace _FUNC_ in UDF ExpressionInfo") {
val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("upper"))
assert(info.getName === "upper")
assert(info.getClassName === "org.apache.spark.sql.catalyst.expressions.Upper")
assert(info.getUsage === "upper(str) - Returns `str` with all characters changed to uppercase.")
assert(info.getExamples.contains("> SELECT upper('SparkSql');"))
assert(info.getSince === "1.0.1")
assert(info.getNote === "")
assert(info.getExtended.contains("> SELECT upper('SparkSql');"))
}

test("group info in ExpressionInfo") {
val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("sum"))
assert(info.getGroup === "agg_funcs")

Seq("agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs", "window_funcs")
.foreach { groupName =>
val info = new ExpressionInfo(
"testClass", null, "testName", null, "", "", "", groupName, "", "")
assert(info.getGroup === groupName)
}

val errMsg = intercept[IllegalArgumentException] {
val invalidGroupName = "invalid_group_funcs"
new ExpressionInfo("testClass", null, "testName", null, "", "", "", invalidGroupName, "", "")
}.getMessage
assert(errMsg.contains("'group' is malformed in the expression [testName]."))
}

test("SPARK-28521 error message for CAST(parameter types contains DataType)") {
val e = intercept[AnalysisException] {
spark.sql("SELECT CAST(1)")
sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala (new file, 156 additions)
@@ -0,0 +1,156 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.expressions

import scala.collection.parallel.immutable.ParVector

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.execution.HiveResult.hiveResultString
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

class ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession {

test("Replace _FUNC_ in ExpressionInfo") {
Member (Author): Moved from UDFSuite.

val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("upper"))
assert(info.getName === "upper")
assert(info.getClassName === "org.apache.spark.sql.catalyst.expressions.Upper")
assert(info.getUsage === "upper(str) - Returns `str` with all characters changed to uppercase.")
assert(info.getExamples.contains("> SELECT upper('SparkSql');"))
assert(info.getSince === "1.0.1")
assert(info.getNote === "")
assert(info.getExtended.contains("> SELECT upper('SparkSql');"))
}

test("group info in ExpressionInfo") {
Member (Author): Moved from UDFSuite.

val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("sum"))
assert(info.getGroup === "agg_funcs")

Seq("agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs", "window_funcs")
.foreach { groupName =>
val info = new ExpressionInfo(
"testClass", null, "testName", null, "", "", "", groupName, "", "")
assert(info.getGroup === groupName)
}

val errMsg = intercept[IllegalArgumentException] {
val invalidGroupName = "invalid_group_funcs"
new ExpressionInfo("testClass", null, "testName", null, "", "", "", invalidGroupName, "", "")
}.getMessage
assert(errMsg.contains("'group' is malformed in the expression [testName]."))
}

test("error handling in ExpressionInfo") {
Member (Author): To improve test coverage, a new test is added here.

val errMsg1 = intercept[IllegalArgumentException] {
val invalidNote = " invalid note"
new ExpressionInfo("testClass", null, "testName", null, "", "", invalidNote, "", "", "")
}.getMessage
assert(errMsg1.contains("'note' is malformed in the expression [testName]."))

val errMsg2 = intercept[IllegalArgumentException] {
val invalidSince = "-3.0.0"
new ExpressionInfo("testClass", null, "testName", null, "", "", "", "", invalidSince, "")
}.getMessage
assert(errMsg2.contains("'since' is malformed in the expression [testName]."))

val errMsg3 = intercept[IllegalArgumentException] {
val invalidDeprecated = " invalid deprecated"
new ExpressionInfo("testClass", null, "testName", null, "", "", "", "", "", invalidDeprecated)
}.getMessage
assert(errMsg3.contains("'deprecated' is malformed in the expression [testName]."))
}

test("using _FUNC_ instead of function names in examples") {
Member (Author): Moved from SQLQuerySuite.

val exampleRe = "(>.*;)".r
val setStmtRe = "(?i)^(>\\s+set\\s+).+".r
val ignoreSet = Set(
// Examples for CaseWhen show simpler syntax:
// `CASE WHEN ... THEN ... WHEN ... THEN ... END`
"org.apache.spark.sql.catalyst.expressions.CaseWhen",
// _FUNC_ is replaced by `locate` but `locate(... IN ...)` is not supported
"org.apache.spark.sql.catalyst.expressions.StringLocate",
// _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)`
"org.apache.spark.sql.catalyst.expressions.Remainder",
// Examples demonstrate alternative names, see SPARK-20749
"org.apache.spark.sql.catalyst.expressions.Length")
spark.sessionState.functionRegistry.listFunction().foreach { funcId =>
val info = spark.sessionState.catalog.lookupFunctionInfo(funcId)
val className = info.getClassName
withClue(s"Expression class '$className'") {
val exprExamples = info.getOriginalExamples
if (!exprExamples.isEmpty && !ignoreSet.contains(className)) {
assert(exampleRe.findAllIn(exprExamples).toIterable
.filter(setStmtRe.findFirstIn(_).isEmpty) // Ignore SET commands
.forall(_.contains("_FUNC_")))
}
}
}
}

test("check outputs of expression examples") {
Member (Author): Moved from SQLQuerySuite.

Contributor: Hi @maropu, this test case has a minor problem when run individually on machines in different timezones: expressions such as FromUnixTime are timezone-aware. In SQLQuerySuite, the timezone is set explicitly in org.apache.spark.sql.QueryTest; however, ExpressionInfoSuite doesn't set a timezone, so this test case fails.

Member: @advancedxy, that was fixed in SPARK-31725.

Member (Author): Have you checked the thread in #28538?

Contributor: @HyukjinKwon and @maropu, thanks for the information. I was checking an older version of Spark 3.0.
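
For context, a hedged sketch of how a suite can pin the timezone so that timezone-aware examples produce stable output (the exact mechanism used by SPARK-31725 may differ; `America/Los_Angeles` is just an assumed choice here):

import java.util.TimeZone
import org.apache.spark.sql.internal.SQLConf

// Pin both the JVM default timezone and Spark's session timezone so that
// expressions like from_unixtime render the same strings on any machine.
TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
spark.sessionState.conf.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles")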

def unindentAndTrim(s: String): String = {
s.replaceAll("\n\\s+", "\n").trim
}
val beginSqlStmtRe = " > ".r
val endSqlStmtRe = ";\n".r
def checkExampleSyntax(example: String): Unit = {
val beginStmtNum = beginSqlStmtRe.findAllIn(example).length
val endStmtNum = endSqlStmtRe.findAllIn(example).length
assert(beginStmtNum === endStmtNum,
"The number of ` > ` does not match to the number of `;`")
}
val exampleRe = """^(.+);\n(?s)(.+)$""".r
val ignoreSet = Set(
// One of examples shows getting the current timestamp
"org.apache.spark.sql.catalyst.expressions.UnixTimestamp",
// Random output without a seed
"org.apache.spark.sql.catalyst.expressions.Rand",
"org.apache.spark.sql.catalyst.expressions.Randn",
"org.apache.spark.sql.catalyst.expressions.Shuffle",
"org.apache.spark.sql.catalyst.expressions.Uuid",
// The example calls methods that return unstable results.
"org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection")

val parFuncs = new ParVector(spark.sessionState.functionRegistry.listFunction().toVector)
parFuncs.foreach { funcId =>
// Examples can change settings. We clone the session to prevent tests clashing.
val clonedSpark = spark.cloneSession()
// Coalescing partitions can change result order, so disable it.
clonedSpark.sessionState.conf.setConf(SQLConf.COALESCE_PARTITIONS_ENABLED, false)
val info = clonedSpark.sessionState.catalog.lookupFunctionInfo(funcId)
val className = info.getClassName
if (!ignoreSet.contains(className)) {
withClue(s"Function '${info.getName}', Expression class '$className'") {
val example = info.getExamples
checkExampleSyntax(example)
example.split(" > ").toList.foreach {
Member (Author): Minor: I modified this line a bit, as my IDE suggested.
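
For reference, the two forms are equivalent; the partial-function literal simply drops the redundant `_ match`. A self-contained sketch:

val exampleRe = """^(.+);\n(?s)(.+)$""".r
val examples = Seq("SELECT 1;\n1")

// Before: explicit match on the lambda parameter.
examples.foreach(_ match {
  case exampleRe(sql, output) => println(s"$sql -> $output")
  case _ =>
})

// After: the idiomatic partial-function literal, identical in behavior.
examples.foreach {
  case exampleRe(sql, output) => println(s"$sql -> $output")
  case _ =>
}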

case exampleRe(sql, output) =>
val df = clonedSpark.sql(sql)
val actual = unindentAndTrim(
hiveResultString(df.queryExecution.executedPlan).mkString("\n"))
val expected = unindentAndTrim(output)
assert(actual === expected)
case _ =>
}
}
}
}
}
}