Skip to content

Commit 3dda58a

Browse files
ueshin authored and cloud-fan committed
[SPARK-26370][SQL] Fix resolution of higher-order function for the same identifier.
## What changes were proposed in this pull request? When a higher-order function uses a lambda variable with the same name as an existing column in `Filter`, or in any other operator whose resolution uses `Analyzer.resolveExpressionBottomUp`, e.g.: ```scala val df = Seq( (Seq(1, 9, 8, 7), 1, 2), (Seq(5, 9, 7), 2, 2), (Seq.empty, 3, 2), (null, 4, 2) ).toDF("i", "x", "d") checkAnswer(df.filter("exists(i, x -> x % d == 0)"), Seq(Row(Seq(1, 9, 8, 7), 1, 2))) checkAnswer(df.select("x").filter("exists(i, x -> x % d == 0)"), Seq(Row(1))) ``` the following exception is thrown: ``` java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.BoundReference cannot be cast to org.apache.spark.sql.catalyst.expressions.NamedExpression at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:237) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at scala.collection.TraversableLike.map(TraversableLike.scala:237) at scala.collection.TraversableLike.map$(TraversableLike.scala:230) at scala.collection.AbstractTraversable.map(Traversable.scala:108) at org.apache.spark.sql.catalyst.expressions.HigherOrderFunction.$anonfun$functionsForEval$1(higherOrderFunctions.scala:147) at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:237) at scala.collection.immutable.List.foreach(List.scala:392) at scala.collection.TraversableLike.map(TraversableLike.scala:237) at scala.collection.TraversableLike.map$(TraversableLike.scala:230) at scala.collection.immutable.List.map(List.scala:298) at org.apache.spark.sql.catalyst.expressions.HigherOrderFunction.functionsForEval(higherOrderFunctions.scala:145) at org.apache.spark.sql.catalyst.expressions.HigherOrderFunction.functionsForEval$(higherOrderFunctions.scala:145) at 
org.apache.spark.sql.catalyst.expressions.ArrayExists.functionsForEval$lzycompute(higherOrderFunctions.scala:369) at org.apache.spark.sql.catalyst.expressions.ArrayExists.functionsForEval(higherOrderFunctions.scala:369) at org.apache.spark.sql.catalyst.expressions.SimpleHigherOrderFunction.functionForEval(higherOrderFunctions.scala:176) at org.apache.spark.sql.catalyst.expressions.SimpleHigherOrderFunction.functionForEval$(higherOrderFunctions.scala:176) at org.apache.spark.sql.catalyst.expressions.ArrayExists.functionForEval(higherOrderFunctions.scala:369) at org.apache.spark.sql.catalyst.expressions.ArrayExists.nullSafeEval(higherOrderFunctions.scala:387) at org.apache.spark.sql.catalyst.expressions.SimpleHigherOrderFunction.eval(higherOrderFunctions.scala:190) at org.apache.spark.sql.catalyst.expressions.SimpleHigherOrderFunction.eval$(higherOrderFunctions.scala:185) at org.apache.spark.sql.catalyst.expressions.ArrayExists.eval(higherOrderFunctions.scala:369) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown Source) at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3(basicPhysicalOperators.scala:216) at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3$adapted(basicPhysicalOperators.scala:215) ... ``` because the `UnresolvedAttribute`s in `LambdaFunction` are unexpectedly resolved by the rule. This PR introduces a placeholder expression, `UnresolvedNamedLambdaVariable`, for lambda variables to prevent this unexpected resolution. ## How was this patch tested? Added a new test and updated some existing tests. Closes #23320 from ueshin/issues/SPARK-26370/hof_resolution. Authored-by: Takuya UESHIN <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 2d8838d commit 3dda58a

File tree

8 files changed

+72
-20
lines changed

8 files changed

+72
-20
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,13 +150,14 @@ case class ResolveLambdaVariables(conf: SQLConf) extends Rule[LogicalPlan] {
150150
val lambdaMap = l.arguments.map(v => canonicalizer(v.name) -> v).toMap
151151
l.mapChildren(resolve(_, parentLambdaMap ++ lambdaMap))
152152

153-
case u @ UnresolvedAttribute(name +: nestedFields) =>
153+
case u @ UnresolvedNamedLambdaVariable(name +: nestedFields) =>
154154
parentLambdaMap.get(canonicalizer(name)) match {
155155
case Some(lambda) =>
156156
nestedFields.foldLeft(lambda: Expression) { (expr, fieldName) =>
157157
ExtractValue(expr, Literal(fieldName), conf.resolver)
158158
}
159-
case None => u
159+
case None =>
160+
UnresolvedAttribute(u.nameParts)
160161
}
161162

162163
case _ =>

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,34 @@ import java.util.concurrent.atomic.AtomicReference
2222
import scala.collection.mutable
2323

2424
import org.apache.spark.sql.catalyst.InternalRow
25-
import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion, UnresolvedAttribute}
25+
import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion, UnresolvedAttribute, UnresolvedException}
2626
import org.apache.spark.sql.catalyst.expressions.codegen._
2727
import org.apache.spark.sql.catalyst.util._
2828
import org.apache.spark.sql.types._
2929
import org.apache.spark.unsafe.array.ByteArrayMethods
3030

31+
/**
32+
* A placeholder of lambda variables to prevent unexpected resolution of [[LambdaFunction]].
33+
*/
34+
case class UnresolvedNamedLambdaVariable(nameParts: Seq[String])
35+
extends LeafExpression with NamedExpression with Unevaluable {
36+
37+
override def name: String =
38+
nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".")
39+
40+
override def exprId: ExprId = throw new UnresolvedException(this, "exprId")
41+
override def dataType: DataType = throw new UnresolvedException(this, "dataType")
42+
override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
43+
override def qualifier: Seq[String] = throw new UnresolvedException(this, "qualifier")
44+
override def toAttribute: Attribute = throw new UnresolvedException(this, "toAttribute")
45+
override def newInstance(): NamedExpression = throw new UnresolvedException(this, "newInstance")
46+
override lazy val resolved = false
47+
48+
override def toString: String = s"lambda '$name"
49+
50+
override def sql: String = name
51+
}
52+
3153
/**
3254
* A named lambda variable.
3355
*/
@@ -79,7 +101,7 @@ case class LambdaFunction(
79101

80102
object LambdaFunction {
81103
val identity: LambdaFunction = {
82-
val id = UnresolvedAttribute.quoted("id")
104+
val id = UnresolvedNamedLambdaVariable(Seq("id"))
83105
LambdaFunction(id, Seq(id))
84106
}
85107
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1338,9 +1338,12 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
13381338
*/
13391339
override def visitLambda(ctx: LambdaContext): Expression = withOrigin(ctx) {
13401340
val arguments = ctx.IDENTIFIER().asScala.map { name =>
1341-
UnresolvedAttribute.quoted(name.getText)
1341+
UnresolvedNamedLambdaVariable(UnresolvedAttribute.quoted(name.getText).nameParts)
13421342
}
1343-
LambdaFunction(expression(ctx.expression), arguments)
1343+
val function = expression(ctx.expression).transformUp {
1344+
case a: UnresolvedAttribute => UnresolvedNamedLambdaVariable(a.nameParts)
1345+
}
1346+
LambdaFunction(function, arguments)
13441347
}
13451348

13461349
/**

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveLambdaVariablesSuite.scala

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,19 +49,21 @@ class ResolveLambdaVariablesSuite extends PlanTest {
4949
comparePlans(Analyzer.execute(plan(e1)), plan(e2))
5050
}
5151

52+
private def lv(s: Symbol) = UnresolvedNamedLambdaVariable(Seq(s.name))
53+
5254
test("resolution - no op") {
5355
checkExpression(key, key)
5456
}
5557

5658
test("resolution - simple") {
57-
val in = ArrayTransform(values1, LambdaFunction('x.attr + 1, 'x.attr :: Nil))
59+
val in = ArrayTransform(values1, LambdaFunction(lv('x) + 1, lv('x) :: Nil))
5860
val out = ArrayTransform(values1, LambdaFunction(lvInt + 1, lvInt :: Nil))
5961
checkExpression(in, out)
6062
}
6163

6264
test("resolution - nested") {
6365
val in = ArrayTransform(values2, LambdaFunction(
64-
ArrayTransform('x.attr, LambdaFunction('x.attr + 1, 'x.attr :: Nil)), 'x.attr :: Nil))
66+
ArrayTransform(lv('x), LambdaFunction(lv('x) + 1, lv('x) :: Nil)), lv('x) :: Nil))
6567
val out = ArrayTransform(values2, LambdaFunction(
6668
ArrayTransform(lvArray, LambdaFunction(lvInt + 1, lvInt :: Nil)), lvArray :: Nil))
6769
checkExpression(in, out)
@@ -75,14 +77,14 @@ class ResolveLambdaVariablesSuite extends PlanTest {
7577

7678
test("fail - name collisions") {
7779
val p = plan(ArrayTransform(values1,
78-
LambdaFunction('x.attr + 'X.attr, 'x.attr :: 'X.attr :: Nil)))
80+
LambdaFunction(lv('x) + lv('X), lv('x) :: lv('X) :: Nil)))
7981
val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage
8082
assert(msg.contains("arguments should not have names that are semantically the same"))
8183
}
8284

8385
test("fail - lambda arguments") {
8486
val p = plan(ArrayTransform(values1,
85-
LambdaFunction('x.attr + 'y.attr + 'z.attr, 'x.attr :: 'y.attr :: 'z.attr :: Nil)))
87+
LambdaFunction(lv('x) + lv('y) + lv('z), lv('x) :: lv('y) :: lv('z) :: Nil)))
8688
val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage
8789
assert(msg.contains("does not match the number of arguments expected"))
8890
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicateSuite.scala

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer
2020
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
2121
import org.apache.spark.sql.catalyst.dsl.expressions._
2222
import org.apache.spark.sql.catalyst.dsl.plans._
23-
import org.apache.spark.sql.catalyst.expressions.{And, ArrayExists, ArrayFilter, ArrayTransform, CaseWhen, Expression, GreaterThan, If, LambdaFunction, Literal, MapFilter, NamedExpression, Or}
23+
import org.apache.spark.sql.catalyst.expressions.{And, ArrayExists, ArrayFilter, ArrayTransform, CaseWhen, Expression, GreaterThan, If, LambdaFunction, Literal, MapFilter, NamedExpression, Or, UnresolvedNamedLambdaVariable}
2424
import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
2525
import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest}
2626
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
@@ -306,22 +306,24 @@ class ReplaceNullWithFalseInPredicateSuite extends PlanTest {
306306
testProjection(originalExpr = column, expectedExpr = column)
307307
}
308308

309+
private def lv(s: Symbol) = UnresolvedNamedLambdaVariable(Seq(s.name))
310+
309311
test("replace nulls in lambda function of ArrayFilter") {
310-
testHigherOrderFunc('a, ArrayFilter, Seq('e))
312+
testHigherOrderFunc('a, ArrayFilter, Seq(lv('e)))
311313
}
312314

313315
test("replace nulls in lambda function of ArrayExists") {
314-
testHigherOrderFunc('a, ArrayExists, Seq('e))
316+
testHigherOrderFunc('a, ArrayExists, Seq(lv('e)))
315317
}
316318

317319
test("replace nulls in lambda function of MapFilter") {
318-
testHigherOrderFunc('m, MapFilter, Seq('k, 'v))
320+
testHigherOrderFunc('m, MapFilter, Seq(lv('k), lv('v)))
319321
}
320322

321323
test("inability to replace nulls in arbitrary higher-order function") {
322324
val lambdaFunc = LambdaFunction(
323-
function = If('e > 0, Literal(null, BooleanType), TrueLiteral),
324-
arguments = Seq[NamedExpression]('e))
325+
function = If(lv('e) > 0, Literal(null, BooleanType), TrueLiteral),
326+
arguments = Seq[NamedExpression](lv('e)))
325327
val column = ArrayTransform('a, lambdaFunc)
326328
testProjection(originalExpr = column, expectedExpr = column)
327329
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,11 @@ class ExpressionParserSuite extends PlanTest {
246246
intercept("foo(a x)", "extraneous input 'x'")
247247
}
248248

249+
private def lv(s: Symbol) = UnresolvedNamedLambdaVariable(Seq(s.name))
250+
249251
test("lambda functions") {
250-
assertEqual("x -> x + 1", LambdaFunction('x + 1, Seq('x.attr)))
251-
assertEqual("(x, y) -> x + y", LambdaFunction('x + 'y, Seq('x.attr, 'y.attr)))
252+
assertEqual("x -> x + 1", LambdaFunction(lv('x) + 1, Seq(lv('x))))
253+
assertEqual("(x, y) -> x + y", LambdaFunction(lv('x) + lv('y), Seq(lv('x), lv('y))))
252254
}
253255

254256
test("window function expressions") {

sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ FROM various_maps
8585
struct<>
8686
-- !query 5 output
8787
org.apache.spark.sql.AnalysisException
88-
cannot resolve 'map_zip_with(various_maps.`decimal_map1`, various_maps.`decimal_map2`, lambdafunction(named_struct(NamePlaceholder(), `k`, NamePlaceholder(), `v1`, NamePlaceholder(), `v2`), `k`, `v1`, `v2`))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,0), decimal(36,35)].; line 1 pos 7
88+
cannot resolve 'map_zip_with(various_maps.`decimal_map1`, various_maps.`decimal_map2`, lambdafunction(named_struct(NamePlaceholder(), k, NamePlaceholder(), v1, NamePlaceholder(), v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,0), decimal(36,35)].; line 1 pos 7
8989

9090

9191
-- !query 6
@@ -113,7 +113,7 @@ FROM various_maps
113113
struct<>
114114
-- !query 8 output
115115
org.apache.spark.sql.AnalysisException
116-
cannot resolve 'map_zip_with(various_maps.`decimal_map2`, various_maps.`int_map`, lambdafunction(named_struct(NamePlaceholder(), `k`, NamePlaceholder(), `v1`, NamePlaceholder(), `v2`), `k`, `v1`, `v2`))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,35), int].; line 1 pos 7
116+
cannot resolve 'map_zip_with(various_maps.`decimal_map2`, various_maps.`int_map`, lambdafunction(named_struct(NamePlaceholder(), k, NamePlaceholder(), v1, NamePlaceholder(), v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,35), int].; line 1 pos 7
117117

118118

119119
-- !query 9

sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2908,6 +2908,26 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
29082908
}
29092909
assert(ex.getMessage.contains("Cannot use null as map key"))
29102910
}
2911+
2912+
test("SPARK-26370: Fix resolution of higher-order function for the same identifier") {
2913+
val df = Seq(
2914+
(Seq(1, 9, 8, 7), 1, 2),
2915+
(Seq(5, 9, 7), 2, 2),
2916+
(Seq.empty, 3, 2),
2917+
(null, 4, 2)
2918+
).toDF("i", "x", "d")
2919+
2920+
checkAnswer(df.selectExpr("x", "exists(i, x -> x % d == 0)"),
2921+
Seq(
2922+
Row(1, true),
2923+
Row(2, false),
2924+
Row(3, false),
2925+
Row(4, null)))
2926+
checkAnswer(df.filter("exists(i, x -> x % d == 0)"),
2927+
Seq(Row(Seq(1, 9, 8, 7), 1, 2)))
2928+
checkAnswer(df.select("x").filter("exists(i, x -> x % d == 0)"),
2929+
Seq(Row(1)))
2930+
}
29112931
}
29122932

29132933
object DataFrameFunctionsSuite {

0 commit comments

Comments
 (0)