Conversation

@wangshuo128 (Contributor) commented Sep 23, 2019

What changes were proposed in this pull request?

Currently, FilterExec computes its output nullability and generates null checks in two inconsistent ways, so a nullable attribute can mistakenly be treated as not nullable.

In FilterExec.output, an attribute is marked as non-nullable by looking up its exprId in notNullAttributes:

a.nullable && notNullAttributes.contains(a.exprId)

But in FilterExec.doConsume, whether a null check is generated for a predicate is decided by whether there is a semantically equal IsNotNull predicate:

      val nullChecks = c.references.map { r =>
        val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)}
        if (idx != -1 && !generatedIsNotNullChecks(idx)) {
          generatedIsNotNullChecks(idx) = true
          // Use the child's output. The nullability is what the child produced.
          genPredicate(notNullPreds(idx), input, child.output)
        } else {
          ""
        }
      }.mkString("\n").trim
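The mismatch can be seen with a minimal Catalyst sketch (illustrative only, not the actual codegen path; the expressions mirror the problematic plan shown further below):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

// Stand-ins for x#7L and the filter
// (length(cast(x as string)) > 0) AND isnotnull(cast(x as string)):
val x = AttributeReference("x", LongType)()
val casted = Cast(x, StringType)
val notNull = IsNotNull(casted)
val other = GreaterThan(Length(casted), Literal(0))

// Output side: x is marked non-nullable, because IsNotNull's references
// contain x's exprId.
notNull.references.exists(_.exprId == x.exprId)               // true

// doConsume side: no null check is generated for `other`, because its
// reference x is not semantically equal to IsNotNull's child (a Cast).
other.references.exists(r => notNull.child.semanticEquals(r)) // false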

An NPE occurs when running the SQL below:

sql("create table table1(x string)")
sql("create table table2(x bigint)")
sql("create table table3(x string)")
sql("insert into table2 select null as x")
sql(
  """
    |select t1.x
    |from (
    |    select x from table1) t1
    |left join (
    |    select x from (
    |        select x from table2
    |        union all
    |        select substr(x,5) x from table3
    |    ) a
    |    where length(x)>0
    |) t3
    |on t1.x=t3.x
  """.stripMargin).collect()

NPE stack trace:

java.lang.NullPointerException
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(generated.java:40)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:726)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:135)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:94)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:449)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:452)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

The generated code:

== Subtree 4 / 5 ==
*(2) Project [cast(x#7L as string) AS x#9]
+- *(2) Filter ((length(cast(x#7L as string)) > 0) AND isnotnull(cast(x#7L as string)))
   +- Scan hive default.table2 [x#7L], HiveTableRelation `default`.`table2`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [x#7L]

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage2(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=2
/* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private scala.collection.Iterator inputadapter_input_0;
/* 010 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] filter_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[2];
/* 011 */
/* 012 */   public GeneratedIteratorForCodegenStage2(Object[] references) {
/* 013 */     this.references = references;
/* 014 */   }
/* 015 */
/* 016 */   public void init(int index, scala.collection.Iterator[] inputs) {
/* 017 */     partitionIndex = index;
/* 018 */     this.inputs = inputs;
/* 019 */     inputadapter_input_0 = inputs[0];
/* 020 */     filter_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0);
/* 021 */     filter_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
/* 022 */
/* 023 */   }
/* 024 */
/* 025 */   protected void processNext() throws java.io.IOException {
/* 026 */     while ( inputadapter_input_0.hasNext()) {
/* 027 */       InternalRow inputadapter_row_0 = (InternalRow) inputadapter_input_0.next();
/* 028 */
/* 029 */       do {
/* 030 */         boolean inputadapter_isNull_0 = inputadapter_row_0.isNullAt(0);
/* 031 */         long inputadapter_value_0 = inputadapter_isNull_0 ?
/* 032 */         -1L : (inputadapter_row_0.getLong(0));
/* 033 */
/* 034 */         boolean filter_isNull_2 = inputadapter_isNull_0;
/* 035 */         UTF8String filter_value_2 = null;
/* 036 */         if (!inputadapter_isNull_0) {
/* 037 */           filter_value_2 = UTF8String.fromString(String.valueOf(inputadapter_value_0));
/* 038 */         }
/* 039 */         int filter_value_1 = -1;
/* 040 */         filter_value_1 = (filter_value_2).numChars();
/* 041 */
/* 042 */         boolean filter_value_0 = false;
/* 043 */         filter_value_0 = filter_value_1 > 0;
/* 044 */         if (!filter_value_0) continue;
/* 045 */
/* 046 */         boolean filter_isNull_6 = inputadapter_isNull_0;
/* 047 */         UTF8String filter_value_6 = null;
/* 048 */         if (!inputadapter_isNull_0) {
/* 049 */           filter_value_6 = UTF8String.fromString(String.valueOf(inputadapter_value_0));
/* 050 */         }
/* 051 */         if (!(!filter_isNull_6)) continue;
/* 052 */
/* 053 */         ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 054 */
/* 055 */         boolean project_isNull_0 = false;
/* 056 */         UTF8String project_value_0 = null;
/* 057 */         if (!false) {
/* 058 */           project_value_0 = UTF8String.fromString(String.valueOf(inputadapter_value_0));
/* 059 */         }
/* 060 */         filter_mutableStateArray_0[1].reset();
/* 061 */
/* 062 */         filter_mutableStateArray_0[1].zeroOutNullBytes();
/* 063 */
/* 064 */         if (project_isNull_0) {
/* 065 */           filter_mutableStateArray_0[1].setNullAt(0);
/* 066 */         } else {
/* 067 */           filter_mutableStateArray_0[1].write(0, project_value_0);
/* 068 */         }
/* 069 */         append((filter_mutableStateArray_0[1].getRow()));
/* 070 */
/* 071 */       } while(false);
/* 072 */       if (shouldStop()) return;
/* 073 */     }
/* 074 */   }
/* 075 */
/* 076 */ }

This PR proposes to use semantic comparison in both FilterExec.output and FilterExec.doConsume for nullable attributes.

With this PR, the generated code is as follows:

== Subtree 2 / 5 ==
*(3) Project [substring(x#8, 5, 2147483647) AS x#5]
+- *(3) Filter ((length(substring(x#8, 5, 2147483647)) > 0) AND isnotnull(substring(x#8, 5, 2147483647)))
   +- Scan hive default.table3 [x#8], HiveTableRelation `default`.`table3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [x#8]

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage3(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=3
/* 006 */ final class GeneratedIteratorForCodegenStage3 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private scala.collection.Iterator inputadapter_input_0;
/* 010 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] filter_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[2];
/* 011 */
/* 012 */   public GeneratedIteratorForCodegenStage3(Object[] references) {
/* 013 */     this.references = references;
/* 014 */   }
/* 015 */
/* 016 */   public void init(int index, scala.collection.Iterator[] inputs) {
/* 017 */     partitionIndex = index;
/* 018 */     this.inputs = inputs;
/* 019 */     inputadapter_input_0 = inputs[0];
/* 020 */     filter_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
/* 021 */     filter_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
/* 022 */
/* 023 */   }
/* 024 */
/* 025 */   protected void processNext() throws java.io.IOException {
/* 026 */     while ( inputadapter_input_0.hasNext()) {
/* 027 */       InternalRow inputadapter_row_0 = (InternalRow) inputadapter_input_0.next();
/* 028 */
/* 029 */       do {
/* 030 */         boolean inputadapter_isNull_0 = inputadapter_row_0.isNullAt(0);
/* 031 */         UTF8String inputadapter_value_0 = inputadapter_isNull_0 ?
/* 032 */         null : (inputadapter_row_0.getUTF8String(0));
/* 033 */
/* 034 */         boolean filter_isNull_0 = true;
/* 035 */         boolean filter_value_0 = false;
/* 036 */         boolean filter_isNull_2 = true;
/* 037 */         UTF8String filter_value_2 = null;
/* 038 */
/* 039 */         if (!inputadapter_isNull_0) {
/* 040 */           filter_isNull_2 = false; // resultCode could change nullability.
/* 041 */           filter_value_2 = inputadapter_value_0.substringSQL(5, 2147483647);
/* 042 */
/* 043 */         }
/* 044 */         boolean filter_isNull_1 = filter_isNull_2;
/* 045 */         int filter_value_1 = -1;
/* 046 */
/* 047 */         if (!filter_isNull_2) {
/* 048 */           filter_value_1 = (filter_value_2).numChars();
/* 049 */         }
/* 050 */         if (!filter_isNull_1) {
/* 051 */           filter_isNull_0 = false; // resultCode could change nullability.
/* 052 */           filter_value_0 = filter_value_1 > 0;
/* 053 */
/* 054 */         }
/* 055 */         if (filter_isNull_0 || !filter_value_0) continue;
/* 056 */         boolean filter_isNull_8 = true;
/* 057 */         UTF8String filter_value_8 = null;
/* 058 */
/* 059 */         if (!inputadapter_isNull_0) {
/* 060 */           filter_isNull_8 = false; // resultCode could change nullability.
/* 061 */           filter_value_8 = inputadapter_value_0.substringSQL(5, 2147483647);
/* 062 */
/* 063 */         }
/* 064 */         if (!(!filter_isNull_8)) continue;
/* 065 */
/* 066 */         ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 067 */
/* 068 */         boolean project_isNull_0 = true;
/* 069 */         UTF8String project_value_0 = null;
/* 070 */
/* 071 */         if (!inputadapter_isNull_0) {
/* 072 */           project_isNull_0 = false; // resultCode could change nullability.
/* 073 */           project_value_0 = inputadapter_value_0.substringSQL(5, 2147483647);
/* 074 */
/* 075 */         }
/* 076 */         filter_mutableStateArray_0[1].reset();
/* 077 */
/* 078 */         filter_mutableStateArray_0[1].zeroOutNullBytes();
/* 079 */
/* 080 */         if (project_isNull_0) {
/* 081 */           filter_mutableStateArray_0[1].setNullAt(0);
/* 082 */         } else {
/* 083 */           filter_mutableStateArray_0[1].write(0, project_value_0);
/* 084 */         }
/* 085 */         append((filter_mutableStateArray_0[1].getRow()));
/* 086 */
/* 087 */       } while(false);
/* 088 */       if (shouldStop()) return;
/* 089 */     }
/* 090 */   }
/* 091 */
/* 092 */ }

Why are the changes needed?

Fix NPE bug in FilterExec.

Does this PR introduce any user-facing change?

no

How was this patch tested?

new UT

@wangshuo128 (Contributor, Author):

@viirya @cloud-fan Gentle ping, would you mind reviewing this? Thank you. :)

wangshuo128 changed the title [SPARK-29213][SQL] Make it consistent when get notnull output and generate… [SPARK-29213][SQL] Make it consistent when get notnull output and generate null checks in FilterExec Sep 23, 2019
@cloud-fan (Contributor):

ok to test

override def output: Seq[Attribute] = {
  child.output.map { a =>
-   if (a.nullable && notNullAttributes.contains(a.exprId)) {
+   if (a.nullable && notNullPreds.exists(_.semanticEquals(a))) {
@cloud-fan (Contributor) commented Sep 23, 2019:

What's the real difference? AFAIK Attribute#semanticEquals just compares the expr id.

@SparkQA commented Sep 23, 2019

Test build #111232 has finished for PR 25902 at commit 7e54ccb.

  • This patch fails Spark unit tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

override def output: Seq[Attribute] = {
  child.output.map { a =>
-   if (a.nullable && notNullAttributes.contains(a.exprId)) {
+   if (a.nullable && notNullPreds.exists(_.semanticEquals(a))) {
Member:

notNullPreds is a list of IsNotNull expressions. I think this semanticEquals comparison will always fail?

Contributor (Author):

Yeah, this will always fail. This change is not good enough. I'll try another way.
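For reference, a quick sketch of why the comparison can never match: an IsNotNull node is a different expression from the bare attribute, even when its child is that very attribute:

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

val a = AttributeReference("a", StringType)()
IsNotNull(a).semanticEquals(a)  // false: an IsNotNull node vs. an attribute
a.semanticEquals(a)             // true: attributes compare by exprId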

@wangshuo128 (Contributor, Author) commented Sep 24, 2019

@cloud-fan @viirya
Sorry, my earlier description and fix are not good enough.

The real problem is here: a null check will not be generated for (length(cast(x#7L as string)) > 0), because its reference x#7L and cast(x#7L as string) (the child of the IsNotNull predicate isnotnull(cast(x#7L as string))) are not semantically equal when generating code for Filter ((length(cast(x#7L as string)) > 0) AND isnotnull(cast(x#7L as string))).

    val generated = otherPreds.map { c =>
      val nullChecks = c.references.map { r =>
        val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)}
        if (idx != -1 && !generatedIsNotNullChecks(idx)) {
          generatedIsNotNullChecks(idx) = true
          // Use the child's output. The nullability is what the child produced.
          genPredicate(notNullPreds(idx), input, child.output)
        } else {
          ""
        }
      }.mkString("\n").trim

      // Here we use *this* operator's output with this output's nullability since we already
      // enforced them with the IsNotNull checks above.
      s"""
         |$nullChecks
         |${genPredicate(c, input, output)}
       """.stripMargin.trim
    }.mkString("\n")

At the same time, the filter's output attribute x#7L is marked as not nullable, so nullability will not be checked when generating code that consumes x#7L. As a result, an NPE is thrown when the generated code for (length(cast(x#7L as string)) > 0) reads null data.

To fix this, I think we can filter out attributes that appear in both notNullPreds and otherPreds when computing notNullAttributes:

  private val notNullAttributes = notNullPreds.flatMap(_.references).distinct.map(_.exprId)
    .diff(otherPreds.flatMap(_.references).distinct.map(_.exprId))

@SparkQA commented Sep 24, 2019

Test build #111255 has finished for PR 25902 at commit af0fb3f.

  • This patch fails Spark unit tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

@SparkQA commented Sep 24, 2019

Test build #111256 has finished for PR 25902 at commit bc0a688.

  • This patch fails Spark unit tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

JoshRosen added a commit to JoshRosen/spark that referenced this pull request Sep 24, 2019
Some slight modifications so we no longer depend on Hive.

// The columns that will be filtered out by `IsNotNull` could be considered as not nullable.
private val notNullAttributes = notNullPreds.flatMap(_.references).distinct.map(_.exprId)
  .diff(otherPreds.flatMap(_.references).distinct.map(_.exprId))
Contributor:

This fix seems sub-optimal to me. If we have an IsNotNull(a), then a should be a notNullAttribute even if a appears in otherPreds, e.g. EqualTo(a, b).
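A quick sketch of this concern, plugging the EqualTo example into the diff-based definition above (illustrative only):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

val a = AttributeReference("a", StringType)()
val b = AttributeReference("b", StringType)()
val notNullPreds: Seq[Expression] = Seq(IsNotNull(a))
val otherPreds: Seq[Expression] = Seq(EqualTo(a, b))

val notNullAttributes = notNullPreds.flatMap(_.references).distinct.map(_.exprId)
  .diff(otherPreds.flatMap(_.references).distinct.map(_.exprId))

// IsNotNull(a) guarantees a is non-null, yet the diff drops it:
notNullAttributes.contains(a.exprId)  // false: overly conservative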

Contributor:

I think the bug is about how we codegen null checks, the current logic is:

  1. split the predicates into notNullPreds (IsNotNull expressions) and otherPreds
  2. try codegen otherPreds first, then notNullPreds.
  3. if an other-predicate can leverage a specific not-null-predicate, then codegen that not-null-predicate first, so that we can codegen the other-predicate with non-nullable attributes.

There is a problem in step 3. Imagine that we have an IsNotNull(SubString(a)) and a SomeFunc(a). Then a is a not-null attribute for sure, even if there is no IsNotNull(a) predicate.

When we codegen SomeFunc(a), we can't find an IsNotNull(a) expression, so we skip the null check. However, we assume a is not nullable when we codegen SomeFunc(a), which is wrong, as IsNotNull(SubString(a)) has not been codegened yet.
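Concretely, the lookup in step 3 misses (a sketch; SomeFunc stands in for any non-IsNotNull predicate over a):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

val a = AttributeReference("a", StringType)()
val notNullPreds: Seq[Expression] =
  Seq(IsNotNull(Substring(a, Literal(5), Literal(Int.MaxValue))))

// Codegen of SomeFunc(a) searches for an IsNotNull whose child is `a`:
notNullPreds.indexWhere {
  n => n.asInstanceOf[IsNotNull].child.semanticEquals(a)
}  // -1: no null check is emitted, although `a` is treated as non-nullable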

@viirya (Member) commented Sep 24, 2019

I think the problem is that the child of IsNotNull is not a reference. So when we look for the IsNotNull that can filter null values for a non-IsNotNull predicate, we fail to look it up:

val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)}

Wouldn't the easiest fix look like:

val idx = notNullPreds.indexWhere { n =>
  n.asInstanceOf[IsNotNull].child.references.contains(r)
}

@cloud-fan (Contributor):

@viirya your proposal works, but goes a little against the original design goal. Think about IsNotNull(a + b + c + d ..) and Substring(a).

When we codegen Substring(a), with your proposal, we would codegen IsNotNull(a + b + c + d ..) first, which loads more columns and may cause a perf regression.

I have a slightly different proposal: when we codegen Substring(a), check if there is an IsNotNull(a) predicate. If there is, codegen it first. If not, and a is a not-nullable attribute, then generate an IsNotNull(a) and codegen it first.

@viirya (Member) commented Sep 25, 2019

Ah, I see. You are correct. Your proposal sounds good.

@wangshuo128 (Contributor, Author) commented Sep 25, 2019

@cloud-fan

I have a slightly different proposal: when we codegen Substring(a), check if there is an IsNotNull(a) predicate. If there is, codegen it first. If not, and a is a not-nullable attribute, then generate an IsNotNull(a) and codegen it first.

In Substring(a), a could be an expression. If we have predicates like Substring(Substring(a)) and IsNotNull(Substring(a)), could we codegen IsNotNull(Substring(a)) for the null check first?

To be more comprehensive, should we check whether there is an IsNotNull predicate for a predicate and all its children before generating IsNotNull for its references? But this is a little complicated.

@cloud-fan (Contributor):

@wangshuo128 you are right that we can improve the algorithm further, and we can do it once the bug is fixed. For now let's fix the bug without introducing a regression.

That said: for any predicate (not IsNotNull), find all its references (e.g. a and b), and check if there are corresponding IsNotNull predicates (e.g. IsNotNull(a) and IsNotNull(b)). If a reference has a corresponding IsNotNull predicate, codegen it first. Otherwise, if the reference is a not-nullable attribute, generate a new IsNotNull predicate for it and codegen that first.
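A rough sketch of that per-reference logic, reusing names from the snippets quoted earlier; the mutable set extraIsNotNullReferences (which appears in the patch below) deduplicates the synthesized checks. This is illustrative, not the exact merged diff:

val generated = otherPreds.map { c =>
  val nullChecks = c.references.map { r =>
    val idx = notNullPreds.indexWhere {
      n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)
    }
    if (idx != -1 && !generatedIsNotNullChecks(idx)) {
      generatedIsNotNullChecks(idx) = true
      // An explicit IsNotNull exists for r: codegen it first, as before.
      genPredicate(notNullPreds(idx), input, child.output)
    } else if (notNullAttributes.contains(r.exprId) &&
        !extraIsNotNullReferences.contains(r)) {
      extraIsNotNullReferences += r
      // r is non-nullable in the output but has no explicit IsNotNull(r):
      // synthesize one so a null check is still emitted.
      genPredicate(IsNotNull(r), input, child.output)
    } else {
      ""
    }
  }.mkString("\n").trim

  s"""
     |$nullChecks
     |${genPredicate(c, input, output)}
   """.stripMargin.trim
}.mkString("\n")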

@wangshuo128 (Contributor, Author):

For now let's fix the bug without introducing a regression.

Ok, I'll try to fix this bug first.

@SparkQA commented Sep 25, 2019

Test build #111341 has finished for PR 25902 at commit 5555731.

  • This patch fails Spark unit tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

@wangshuo128 (Contributor, Author):

Sorry @JoshRosen, do you mind if I use your code to move the test from sql/hive to sql/core?

@JoshRosen (Contributor):

Sorry @JoshRosen, do you mind if I use your code to move the test from sql/hive to sql/core?

Yes, please feel free to use that! This bug is somewhat hard to reproduce in unit tests because filter pushdown to data source scans will prevent the FilterExec code from processing the null-containing input rows. In d1658bb I worked around that by using typed Dataset operations to create black-box operators through which the optimizer cannot push filters.

(I'm following this PR because I'm interested in other aspects of IsNotNull optimizations; I have some notes on this particular code in #24765, something of a backburner project as I learn more about this code)
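A sketch of that workaround, assuming a SparkSession named spark is in scope:

import spark.implicits._

// The typed map(identity) becomes a black-box object operator
// (DeserializeToObject / MapElements / SerializeFromObject), so the
// optimizer cannot push the IsNotNull filter below it and FilterExec
// must process the null-containing rows itself.
spark.sql("SELECT CAST(null AS STRING) AS x")
  .as[String].map(identity)
  .toDF("x")
  .createOrReplaceTempView("t2")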

// TODO: revisit this. We can consider reordering predicates as well.
val generatedIsNotNullChecks = new Array[Boolean](notNullPreds.length)

val extraIsNotNullReferences = mutable.Set[Attribute]()
Contributor:

nit: extraIsNotNullAttrs


val extraIsNotNullReferences = mutable.Set[Attribute]()

def outputContainsNotNull(ref: Attribute): Boolean = {
@cloud-fan (Contributor) commented Sep 25, 2019:

This method always returns true, so we add IsNotNull for all attributes, which is sub-optimal. We should check notNullAttributes.contains(ref.exprId).
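i.e., roughly (a sketch of the suggested check, not the exact patch):

def outputContainsNotNull(ref: Attribute): Boolean = {
  // Only treat ref as non-nullable (and worth an extra IsNotNull check)
  // if it is actually in this FilterExec's not-null output set.
  notNullAttributes.contains(ref.exprId)
}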

Contributor (Author):

Ah, my mistake.

}
}

test("SPARK-29213 Make it consistent when get notnull output and generate null " +
Contributor:

We can simply describe the bug in the test name: SPARK-29213: FilterExec should not throw NPE

@wangshuo128 (Contributor, Author):

Yes, please feel free to use that!

@JoshRosen Thanks a lot!

}

test("SPARK-29213: FilterExec should not throw NPE") {
withView("t1", "t2", "t3") {
Member:

nit: withView -> withTempView

sql("select ''").as[String].map(identity).toDF("x").createOrReplaceTempView("t3")
sql(
"""
|select t1.x
Member:

nit: No strict rule though, I like capitalized words for SQL keywords: e.g., select -> SELECT, ....

@SparkQA commented Sep 26, 2019

Test build #111377 has finished for PR 25902 at commit 5272097.

  • This patch passes all tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

@SparkQA commented Sep 26, 2019

Test build #111378 has finished for PR 25902 at commit fcada1c.

  • This patch passes all tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

@SparkQA commented Sep 26, 2019

Test build #111393 has finished for PR 25902 at commit f33d34c.

  • This patch fails due to an unknown error code, -9.
  • This patch does not merge cleanly.
  • This patch adds no public classes.

wangshuo128 changed the title [SPARK-29213][SQL] Make it consistent when get notnull output and generate null checks in FilterExec [SPARK-29213][SQL] Generate extra IsNotNull predicate in FilterExec Sep 26, 2019
@maropu (Member) commented Sep 27, 2019

@wangshuo128 Can you resolve the conflict by git-rebase?

@wangshuo128 (Contributor, Author):

@wangshuo128 Can you resolve the conflict first?

Ok

@SparkQA commented Sep 27, 2019

Test build #111460 has finished for PR 25902 at commit 124ad87.

  • This patch passes all tests.
  • This patch merges cleanly.
  • This patch adds no public classes.

cloud-fan closed this in bd28e8e Sep 27, 2019
@cloud-fan (Contributor):

Thanks, merging to master/2.4!

@wangshuo128 (Contributor, Author):

Thanks!

cloud-fan pushed a commit that referenced this pull request Sep 27, 2019

Closes #25902 from wangshuo128/filter-codegen-npe.

Authored-by: Wang Shuo <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit bd28e8e)
Signed-off-by: Wenchen Fan <[email protected]>
wangshuo128 deleted the filter-codegen-npe branch September 27, 2019 08:08
wangshuo128 restored the filter-codegen-npe branch September 27, 2019 08:15