Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/113123.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113123
summary: "ES|QL: Skip CASE function from `InferIsNotNull` rule checks"
area: ES|QL
type: bug
issues:
- 112704
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,23 @@ warning:Line 1:38: java.lang.IllegalArgumentException: single-value function enc

a:integer | b:integer | same:boolean
;

caseOnTheValue_NotOnTheField
required_capability: fixed_wrong_is_not_null_check_on_case

FROM employees
| WHERE emp_no < 10022 AND emp_no > 10012
| KEEP languages, emp_no
| EVAL eval = CASE(languages == 1, null, languages == 2, "bilingual", languages > 2, "multilingual", languages IS NULL, "languages is null")
| SORT languages, emp_no
| WHERE eval IS NOT NULL;

languages:integer| emp_no:integer|eval:keyword
2 |10016 |bilingual
2 |10017 |bilingual
2 |10018 |bilingual
5 |10014 |multilingual
5 |10015 |multilingual
null |10020 |languages is null
null |10021 |languages is null
;
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,13 @@ public enum Cap {
/**
* QSTR function
*/
QSTR_FUNCTION(true);
QSTR_FUNCTION(true),

/**
* Don't apply the IS NOT NULL inference to CASE: "CASE(...) IS NOT NULL" must not also require the fields used inside CASE to be not null.
* https://github.com/elastic/elasticsearch/issues/112704
*/
FIXED_WRONG_IS_NOT_NULL_CHECK_ON_CASE;

private final boolean snapshotOnly;
private final FeatureFlag featureFlag;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import org.elasticsearch.xpack.esql.core.expression.predicate.nulls.IsNotNull;
import org.elasticsearch.xpack.esql.core.rule.Rule;
import org.elasticsearch.xpack.esql.core.util.CollectionUtils;
import org.elasticsearch.xpack.esql.expression.function.scalar.conditional.Case;
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;

Expand All @@ -24,8 +25,7 @@
import static java.util.Collections.emptySet;

/**
* Simplify IsNotNull targets by resolving the underlying expression to its root fields with unknown
* nullability.
* Simplify IsNotNull targets by resolving the underlying expression to its root fields.
* e.g.
* (x + 1) / 2 IS NOT NULL --> x IS NOT NULL AND (x+1) / 2 IS NOT NULL
* SUBSTRING(x, 3) > 4 IS NOT NULL --> x IS NOT NULL AND SUBSTRING(x, 3) > 4 IS NOT NULL
Expand Down Expand Up @@ -85,7 +85,7 @@ protected Set<Expression> resolveExpressionAsRootAttributes(Expression exp, Attr

private boolean doResolve(Expression exp, AttributeMap<Expression> aliases, Set<Expression> resolvedExpressions) {
boolean changed = false;
// check if the expression can be skipped or is not nullabe
// check if the expression can be skipped
if (skipExpression(exp)) {
resolvedExpressions.add(exp);
} else {
Expand All @@ -106,6 +106,13 @@ private boolean doResolve(Expression exp, AttributeMap<Expression> aliases, Set<
}

private static boolean skipExpression(Expression e) {
return e instanceof Coalesce;
// These two functions can have a complex set of expressions as arguments that can mess up the simplification we are trying to add.
// If there is a "case(f is null, null, ...) is not null" expression,
// assuming that "case(f is null.....) is not null AND f is not null" (what this rule is doing) is a wrong assumption because
// the "case" function will want both null "f" and not null "f". Doing it like this contradicts the condition inside case, so we
// must avoid these cases.
// We could be smarter and look inside "case" and "coalesce" to see if there is any comparison of fields with "null" but,
// the complexity is too high to warrant an attempt _now_.
return e instanceof Coalesce || e instanceof Case;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: in case any other change is required to the PR, maybe move the comment from the call site as function javadoc/comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to understand why these two functions are special, at least with a comment, and eventually extract a general rule (a flag interface or a method in the Expression class to identify the functions that fall into this category).
The only patterns I can see are:

  • both have Nullability.UNKNOWN (but also BinaryLogic does, yet it's not listed here). The javadoc mentions unknown nullability as well: Simplify IsNotNull targets by resolving the underlying expression to its root fields with unknown nullability...
  • both return the value of one of the expressions passed as params (but other functions do as well, eg. GREATEST())

Can you please elaborate a bit on this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After spending some time on this issue when it was initially reported, my thinking is the following:

  • Nullability has nothing to do with this optimization. Nullability is most useful in the LogicalPlanOptimizer to short-circuit expressions that fold to null. At this point, in the local logical optimizer, Nullability has already done its job.
  • the fact that Nullability is the same (and unknown) for these two functions (for coalesce it can also be false) is a fact indeed, but conceptually speaking I think it's not relevant
  • these two functions have special behavior in the sense that they can mess up the is not null predicate when it's used on them. They can accept complex expressions where the fields used can also be checked for null. For example, case (x is null,.....) is not null cannot be transformed into case(....) is not null AND x is not null, because the added x is not null contradicts the x is null condition that is already inside the case.
    • I have tried to come up with a smarter way of evaluating the expressions inside a case and determine if the fields there end up as x is null but it proved to be more complicated and I found this not worth the effort now. Maybe we can come back to this and be smarter about case and coalesce but we need a stronger reason to do it. Also, this bug is serious enough that it warrants this simple exclusion and fix.
  • the flag interface or a method in the Expression class at this point in the life of ES|QL (where we welcome any contribution from outside) is, imo, not warranted: we will make the behavior of a function more complex than it needs and it will confuse contributors even more. For now, I prefer to have this logic centralized in an optimization rule. Functions land should be a more approachable one, optimization rules are left for advanced logic and knowledge.

I will add a comment in the PR for this method, @bpintea also mentioned this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these two functions have special behavior in the sense that they can mess up the is not null

I don't completely agree with this statement. These two functions interfere with this rule, not with IsNotNull.

They can accept complex expressions where the fields used can also be checked for null

This is closer to the point IMHO: the real peculiarity is that these functions have a custom behavior with null inputs, ie. even if one input is null, the function can still be not null, so func(input1, input2, ...) IS NOT NULL is not equivalent to input1 IS NOT NULL and func(input1, input2, ...) IS NOT NULL.
I think we can easily express this as a method.

The danger comes from the fact that this is an implementation detail, peculiar to each function.
Or better, it's how our code generator creates evaluators (Coalesce and Case have a custom evaluator, not generated).
This makes the risk pretty low for now, that's why IMHO it's still good as a fix, but IMHO we should make this rule more robust as a follow-up.

the flag interface or a method in the Expression class at this point in the life of ES|QL (where we welcome any contribution from outside) is, imo, not warranted: we will make the behavior of a function more complex than it needs and it will confuse contributors even more

My concern is that a contributor could introduce a new function with a similar behavior, and will unintentionally break this rule.

Also, this bug is serious enough that it warrants this simple exclusion and fix.
💯

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.type.EsField;
import org.elasticsearch.xpack.esql.expression.function.EsqlFunctionRegistry;
import org.elasticsearch.xpack.esql.expression.function.scalar.conditional.Case;
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.StartsWith;
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Add;
Expand Down Expand Up @@ -58,8 +59,11 @@

import static java.util.Collections.emptyMap;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.L;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.ONE;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.TEST_SEARCH_STATS;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.TEST_VERIFIER;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.THREE;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.TWO;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.as;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.getFieldAttribute;
import static org.elasticsearch.xpack.esql.EsqlTestUtils.greaterThanOf;
Expand All @@ -81,10 +85,6 @@ public class LocalLogicalPlanOptimizerTests extends ESTestCase {
private static LogicalPlanOptimizer logicalOptimizer;
private static Map<String, EsField> mapping;

private static final Literal ONE = L(1);
private static final Literal TWO = L(2);
private static final Literal THREE = L(3);

@BeforeClass
public static void init() {
parser = new EsqlParser();
Expand Down Expand Up @@ -386,38 +386,6 @@ public void testMissingFieldInFilterNoProjection() {
);
}

/**
 * Checks the locally optimized plan for {@code coalesce(emp_no, salary) is not null}:
 * Limit over Filter over EsRelation, where the filter condition is still a single
 * IsNotNull wrapping the Coalesce of the two fields (no per-field filter was added).
 */
public void testIsNotNullOnCoalesce() {
    var plan = localPlan("""
        from test
        | where coalesce(emp_no, salary) is not null
        """);

    var limit = as(plan, Limit.class);
    var filter = as(limit.child(), Filter.class);
    // the condition must be the bare IS NOT NULL around the original COALESCE call
    var inn = as(filter.condition(), IsNotNull.class);
    var coalesce = as(inn.children().get(0), Coalesce.class);
    assertThat(Expressions.names(coalesce.children()), contains("emp_no", "salary"));
    // the filter must sit directly on the relation; the value was never read, so the call is
    // kept only for the node-type check that `as` presumably performs
    as(filter.child(), EsRelation.class);
}

/**
 * Checks the locally optimized plan for {@code eval x = emp_no + 1 | where x is not null}:
 * Limit over Filter(x IS NOT NULL) over Eval over Filter(emp_no IS NOT NULL) over EsRelation,
 * i.e. an additional not-null filter on the root field appears below the Eval.
 */
public void testIsNotNullOnExpression() {
    var plan = localPlan("""
        from test
        | eval x = emp_no + 1
        | where x is not null
        """);

    var limit = as(plan, Limit.class);
    // upper filter: the original predicate on the evaluated alias
    var filter = as(limit.child(), Filter.class);
    var inn = as(filter.condition(), IsNotNull.class);
    assertThat(Expressions.names(inn.children()), contains("x"));
    var eval = as(filter.child(), Eval.class);
    // lower filter: the predicate on the underlying field, below the Eval
    filter = as(eval.child(), Filter.class);
    inn = as(filter.condition(), IsNotNull.class);
    assertThat(Expressions.names(inn.children()), contains("emp_no"));
    // the value was never read, so the call is kept only for the node-type check
    // that `as` presumably performs
    as(filter.child(), EsRelation.class);
}

public void testSparseDocument() throws Exception {
var query = """
from large
Expand Down Expand Up @@ -516,6 +484,66 @@ public void testIsNotNullOnFunctionWithTwoFields() {
assertEquals(expected, new InferIsNotNull().apply(f));
}

/**
 * Checks the locally optimized plan for {@code coalesce(emp_no, salary) is not null}:
 * Limit over Filter over EsRelation, where the filter condition is still a single
 * IsNotNull wrapping the Coalesce of the two fields (no per-field filter was added).
 */
public void testIsNotNullOnCoalesce() {
    var plan = localPlan("""
        from test
        | where coalesce(emp_no, salary) is not null
        """);

    var limit = as(plan, Limit.class);
    var filter = as(limit.child(), Filter.class);
    // the condition must be the bare IS NOT NULL around the original COALESCE call
    var inn = as(filter.condition(), IsNotNull.class);
    var coalesce = as(inn.children().get(0), Coalesce.class);
    assertThat(Expressions.names(coalesce.children()), contains("emp_no", "salary"));
    // the filter must sit directly on the relation; the value was never read, so the call is
    // kept only for the node-type check that `as` presumably performs
    as(filter.child(), EsRelation.class);
}

/**
 * Checks the locally optimized plan for {@code eval x = emp_no + 1 | where x is not null}:
 * Limit over Filter(x IS NOT NULL) over Eval over Filter(emp_no IS NOT NULL) over EsRelation,
 * i.e. an additional not-null filter on the root field appears below the Eval.
 */
public void testIsNotNullOnExpression() {
    var plan = localPlan("""
        from test
        | eval x = emp_no + 1
        | where x is not null
        """);

    var limit = as(plan, Limit.class);
    // upper filter: the original predicate on the evaluated alias
    var filter = as(limit.child(), Filter.class);
    var inn = as(filter.condition(), IsNotNull.class);
    assertThat(Expressions.names(inn.children()), contains("x"));
    var eval = as(filter.child(), Eval.class);
    // lower filter: the predicate on the underlying field, below the Eval
    filter = as(eval.child(), Filter.class);
    inn = as(filter.condition(), IsNotNull.class);
    assertThat(Expressions.names(inn.children()), contains("emp_no"));
    // the value was never read, so the call is kept only for the node-type check
    // that `as` presumably performs
    as(filter.child(), EsRelation.class);
}
Comment on lines +487 to +517
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(just checking: these are moved only, right?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, sorry. I forgot to add a comment after I created the PR.


/**
 * Checks the locally optimized plan for an {@code IS NOT NULL} applied to a {@code CASE} whose
 * conditions are plain comparisons: Limit over Filter over EsRelation, where the filter condition
 * is still a single IsNotNull wrapping the Case with all five original arguments in order.
 */
public void testIsNotNullOnCase() {
    var plan = localPlan("""
        from test
        | where case(emp_no > 10000, "1", salary < 50000, "2", first_name) is not null
        """);

    var limit = as(plan, Limit.class);
    var filter = as(limit.child(), Filter.class);
    // the condition must be the bare IS NOT NULL around the original CASE call
    var inn = as(filter.condition(), IsNotNull.class);
    var caseF = as(inn.children().get(0), Case.class);
    assertThat(Expressions.names(caseF.children()), contains("emp_no > 10000", "\"1\"", "salary < 50000", "\"2\"", "first_name"));
    // the filter must sit directly on the relation; the value was never read, so the call is
    // kept only for the node-type check that `as` presumably performs
    as(filter.child(), EsRelation.class);
}

/**
 * Same shape check as the plain CASE test, but the CASE conditions are themselves
 * {@code IS NULL} / {@code IS NOT NULL} checks on fields. The plan must still be
 * Limit over Filter over EsRelation with a single IsNotNull wrapping the unchanged Case.
 */
public void testIsNotNullOnCase_With_IS_NULL() {
    var plan = localPlan("""
        from test
        | where case(emp_no IS NULL, "1", salary IS NOT NULL, "2", first_name) is not null
        """);

    var limit = as(plan, Limit.class);
    var filter = as(limit.child(), Filter.class);
    // the condition must be the bare IS NOT NULL around the original CASE call
    var inn = as(filter.condition(), IsNotNull.class);
    var caseF = as(inn.children().get(0), Case.class);
    assertThat(Expressions.names(caseF.children()), contains("emp_no IS NULL", "\"1\"", "salary IS NOT NULL", "\"2\"", "first_name"));
    // the filter must sit directly on the relation; the value was never read, so the call is
    // kept only for the node-type check that `as` presumably performs
    as(filter.child(), EsRelation.class);
}

/**
 * Wraps the given expression in an {@link IsNotNull} predicate built with the {@code EMPTY} source
 * constant (presumably a placeholder source location for tests; confirm against its declaration).
 *
 * @param field the expression to test for non-nullness
 * @return a new {@code IsNotNull} over {@code field}
 */
private IsNotNull isNotNull(Expression field) {
    final IsNotNull notNullPredicate = new IsNotNull(EMPTY, field);
    return notNullPredicate;
}
Expand Down