-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-32148][SS] Fix stream-stream join issue on missing to copy reused unsafe row #28975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
82e5a76
be34258
e2201ef
1c011ab
fb63d7e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| package org.apache.spark.sql.streaming | ||
|
|
||
| import java.io.File | ||
| import java.sql.Timestamp | ||
| import java.util.{Locale, UUID} | ||
|
|
||
| import scala.util.Random | ||
|
|
@@ -996,4 +997,47 @@ class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with | |
| ) | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-32148 stream-stream join regression on Spark 3.0.0") { | ||
| val input1 = MemoryStream[(Timestamp, String, String)] | ||
| val df1 = input1.toDF | ||
| .selectExpr("_1 as eventTime", "_2 as id", "_3 as comment") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any specific reason not to use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess it's simpler and more readable than |
||
| .withWatermark(s"eventTime", "2 minutes") | ||
|
|
||
| val input2 = MemoryStream[(Timestamp, String, String)] | ||
| val df2 = input2.toDF | ||
| .selectExpr("_1 as eventTime", "_2 as id", "_3 as name") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here as well. |
||
| .withWatermark(s"eventTime", "4 minutes") | ||
|
|
||
| val joined = df1.as("left") | ||
| .join(df2.as("right"), | ||
| expr(""" | ||
| |left.id = right.id AND left.eventTime BETWEEN | ||
| | right.eventTime - INTERVAL 30 seconds AND | ||
| | right.eventTime + INTERVAL 30 seconds | ||
| """.stripMargin), | ||
| joinType = "leftOuter") | ||
|
|
||
| val inputDataForInput1 = Seq( | ||
| (Timestamp.valueOf("2020-01-01 00:00:00"), "abc", "has no join partner"), | ||
| (Timestamp.valueOf("2020-01-02 00:00:00"), "abc", "joined with A"), | ||
| (Timestamp.valueOf("2020-01-02 01:00:00"), "abc", "joined with B")) | ||
|
|
||
| val inputDataForInput2 = Seq( | ||
| (Timestamp.valueOf("2020-01-02 00:00:10"), "abc", "A"), | ||
| (Timestamp.valueOf("2020-01-02 00:59:59"), "abc", "B"), | ||
| (Timestamp.valueOf("2020-01-02 02:00:00"), "abc", "C")) | ||
|
|
||
| val expectedOutput = Seq( | ||
| (Timestamp.valueOf("2020-01-01 00:00:00"), "abc", "has no join partner", null, null, null), | ||
| (Timestamp.valueOf("2020-01-02 00:00:00"), "abc", "joined with A", | ||
| Timestamp.valueOf("2020-01-02 00:00:10"), "abc", "A"), | ||
| (Timestamp.valueOf("2020-01-02 01:00:00"), "abc", "joined with B", | ||
| Timestamp.valueOf("2020-01-02 00:59:59"), "abc", "B")) | ||
|
|
||
| testStream(joined)( | ||
| MultiAddData((input1, inputDataForInput1), (input2, inputDataForInput2)), | ||
| CheckNewAnswer(expectedOutput.head, expectedOutput.tail: _*) | ||
| ) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to clarify: this comment is not related to the bug and is just documenting an existing assumption?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes right.
TBH, I suspected this first and crafted a patch in which a new iterator explicitly runs the logic after innerOutputIter has been evaluated; later I realized the current logic already deals with this properly, because removeOldState() doesn't materialize the candidates and evaluates them lazily. This patch contains the minimal change.
It's worth mentioning how it works for anyone who may need to touch this code.