Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ object SchemaPruning extends SQLConfHelper {
RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
case s: SubqueryExpression =>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially, I tried another approach: passing an AttributeSet with the table attributes and checking, in the cases above, whether an attribute belongs to the table output. However, that required changes in many places. This change is much smaller. Let me know if there are cases where this will not work.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change looks reasonable to me. I am not aware of cases when this will not work. Let's wait for feedback from others.

// use subquery references that only include outer attrs and
// ignore join conditions as those may include attributes from other tables
s.references.toSeq.flatMap(getRootFields)
case _ =>
expr.children.flatMap(getRootFields)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -935,4 +935,106 @@ abstract class SchemaPruningSuite
.count()
assert(count == 0)
}

// Regression test for SPARK-38977: a query with two correlated EXISTS predicates,
// one correlated on `c.id` and one on the nested field `c.name.first`.
// NOTE(review): `contacts` is not created here; presumably the `testSchemaPruning`
// harness provides it -- confirm against the suite.
testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS subquery") {
Copy link
Contributor Author

@aokolnychyi aokolnychyi Apr 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of these queries would previously fail for V2 tables.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this bug only happens for v2 tables, not file source tables?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it will fail for both as the same method is used. Tests cover V1 and V2 so it should work for both now.


// Brings `toDF` into scope for the local Seq values below.
import testImplicits._

withTempView("ids", "first_names") {
// Lookup view probed by the first subquery (single column `value`: 1, 2, 3).
val df1 = Seq(1, 2, 3).toDF("value")
df1.createOrReplaceTempView("ids")

// Lookup view probed by the second subquery (single column `value`: John, Bob).
val df2 = Seq("John", "Bob").toDF("value")
df2.createOrReplaceTempView("first_names")

// Both subqueries reference columns of the outer `contacts` row (`c.id`, `c.name.first`).
val query = sql(
"""SELECT name FROM contacts c
|WHERE
| EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
| AND
| EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
|""".stripMargin)

// Expected scan schema: `id` plus the full `name` struct (selected as a whole by the query).
checkScan(query, "struct<id:int,name:struct<first:string,middle:string,last:string>>")

// Exactly one row is expected to satisfy both EXISTS predicates.
checkAnswer(query, Row(Row("John", "Y.", "Doe")) :: Nil)
}
}

// Regression test for SPARK-38977: two correlated NOT EXISTS predicates, one
// correlated on `c.id` and one on the nested field `c.name.first`.
// NOTE(review): `contacts` is presumably supplied by the `testSchemaPruning` harness.
testSchemaPruning("SPARK-38977: schema pruning with correlated NOT EXISTS subquery") {

  import testImplicits._

  // Schema the scan is expected to read: `id` plus the whole `name` struct.
  val expectedScanSchema =
    "struct<id:int,name:struct<first:string,middle:string,last:string>>"

  withTempView("ids", "first_names") {
    // Lookup views probed by the two subqueries.
    Seq(1, 2, 3).toDF("value").createOrReplaceTempView("ids")
    Seq("John", "Bob").toDF("value").createOrReplaceTempView("first_names")

    val result = sql(
      """SELECT name FROM contacts c
        |WHERE
        | NOT EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
        | AND
        | NOT EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
        |""".stripMargin)

    checkScan(result, expectedScanSchema)

    // Only one contact is expected to match neither lookup view.
    checkAnswer(result, Seq(Row(Row("Jane", "X.", "Doe"))))
  }
}

// Regression test for SPARK-38977: two IN predicates whose subqueries are
// correlated on outer columns (`c.pets`, `c.name.last`) that differ from the
// columns being tested for membership (`id`, `name.first`).
// NOTE(review): `contacts` is presumably supplied by the `testSchemaPruning` harness.
testSchemaPruning("SPARK-38977: schema pruning with correlated IN subquery") {

  import testImplicits._

  // Schema the scan is expected to read: `id`, the whole `name` struct, and `pets`.
  val expectedScanSchema =
    "struct<id:int,name:struct<first:string,middle:string,last:string>,pets:int>"

  withTempView("ids", "first_names") {
    // Lookup views probed by the two subqueries.
    Seq(1, 2, 3).toDF("value").createOrReplaceTempView("ids")
    Seq("John", "Bob").toDF("value").createOrReplaceTempView("first_names")

    val result = sql(
      """SELECT name FROM contacts c
        |WHERE
        | id IN (SELECT * FROM ids i WHERE c.pets > i.value)
        | AND
        | name.first IN (SELECT * FROM first_names n WHERE c.name.last < n.value)
        |""".stripMargin)

    checkScan(result, expectedScanSchema)

    // Exactly one contact is expected to satisfy both IN predicates.
    checkAnswer(result, Seq(Row(Row("John", "Y.", "Doe"))))
  }
}

// Regression test for SPARK-38977: two NOT IN predicates whose subqueries are
// correlated on outer columns (`c.pets`, `c.name.last`) that differ from the
// columns being tested for membership (`id`, `name.first`).
// NOTE(review): `contacts` is presumably supplied by the `testSchemaPruning` harness.
testSchemaPruning("SPARK-38977: schema pruning with correlated NOT IN subquery") {

  import testImplicits._

  // Schema the scan is expected to read: `id`, the whole `name` struct, and `pets`.
  val expectedScanSchema =
    "struct<id:int,name:struct<first:string,middle:string,last:string>,pets:int>"

  withTempView("ids", "first_names") {
    // Lookup views probed by the two subqueries; this test uses a longer name list.
    Seq(1, 2, 3).toDF("value").createOrReplaceTempView("ids")
    Seq("John", "Janet", "Jim", "Bob").toDF("value").createOrReplaceTempView("first_names")

    val result = sql(
      """SELECT name FROM contacts c
        |WHERE
        | id NOT IN (SELECT * FROM ids i WHERE c.pets > i.value)
        | AND
        | name.first NOT IN (SELECT * FROM first_names n WHERE c.name.last > n.value)
        |""".stripMargin)

    checkScan(result, expectedScanSchema)

    // Exactly one contact is expected to satisfy both NOT IN predicates.
    checkAnswer(result, Seq(Row(Row("Jane", "X.", "Doe"))))
  }
}
}