[SPARK-21595] Separate thresholds for buffering and spilling in ExternalAppendOnlyUnsafeRowArray #18843
```diff
@@ -844,24 +844,47 @@ object SQLConf {
       .stringConf
       .createWithDefaultFunction(() => TimeZone.getDefault.getID)
 
+  val WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD =
+    buildConf("spark.sql.windowExec.buffer.in.memory.threshold")
+      .internal()
+      .doc("Threshold for number of rows guaranteed to be held in memory by the window operator")
+      .intConf
+      .createWithDefault(4096)
+
   val WINDOW_EXEC_BUFFER_SPILL_THRESHOLD =
     buildConf("spark.sql.windowExec.buffer.spill.threshold")
       .internal()
-      .doc("Threshold for number of rows buffered in window operator")
+      .doc("Threshold for number of rows to be spilled by window operator")
       .intConf
-      .createWithDefault(4096)
+      .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt)
 
+  val SORT_MERGE_JOIN_EXEC_BUFFER_IN_MEMORY_THRESHOLD =
+    buildConf("spark.sql.sortMergeJoinExec.buffer.in.memory.threshold")
+      .internal()
+      .doc("Threshold for number of rows guaranteed to be held in memory by the sort merge " +
+        "join operator")
+      .intConf
+      .createWithDefault(Int.MaxValue)
+
   val SORT_MERGE_JOIN_EXEC_BUFFER_SPILL_THRESHOLD =
     buildConf("spark.sql.sortMergeJoinExec.buffer.spill.threshold")
       .internal()
-      .doc("Threshold for number of rows buffered in sort merge join operator")
+      .doc("Threshold for number of rows to be spilled by sort merge join operator")
       .intConf
-      .createWithDefault(Int.MaxValue)
+      .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt)
 
+  val CARTESIAN_PRODUCT_EXEC_BUFFER_IN_MEMORY_THRESHOLD =
+    buildConf("spark.sql.cartesianProductExec.buffer.in.memory.threshold")
+      .internal()
+      .doc("Threshold for number of rows guaranteed to be held in memory by the cartesian " +
+        "product operator")
+      .intConf
+      .createWithDefault(4096)
+
   val CARTESIAN_PRODUCT_EXEC_BUFFER_SPILL_THRESHOLD =
     buildConf("spark.sql.cartesianProductExec.buffer.spill.threshold")
       .internal()
-      .doc("Threshold for number of rows buffered in cartesian product operator")
+      .doc("Threshold for number of rows to be spilled by cartesian product operator")
       .intConf
       .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt)
 
```
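To make the intent of the paired knobs concrete before the accessor changes below: rows are guaranteed to stay in a plain on-heap buffer up to the in-memory threshold; beyond it, rows move to a spill-capable buffer (`UnsafeExternalSorter` in the real code), which spills to disk once the spill threshold is reached. The following is a minimal, self-contained sketch of that strategy only; it is not the actual `ExternalAppendOnlyUnsafeRowArray`, and `TwoThresholdBuffer` and its members are hypothetical names.

```scala
// Illustrative sketch only -- not the real ExternalAppendOnlyUnsafeRowArray.
// Rows below `inMemoryThreshold` stay in a cheap on-heap buffer; once exceeded, rows move to a
// spill-capable buffer that "spills" every `spillThreshold` rows (the real code delegates this
// to UnsafeExternalSorter).
import scala.collection.mutable.ArrayBuffer

final class TwoThresholdBuffer[T](inMemoryThreshold: Int, spillThreshold: Int) {
  private val inMemory = new ArrayBuffer[T]()   // rows guaranteed to be held in memory
  private val spillable = new ArrayBuffer[T]()  // stand-in for the spill-capable sorter
  private var spillCount = 0

  def add(row: T): Unit = {
    if (spillable.isEmpty && inMemory.length < inMemoryThreshold) {
      inMemory += row
    } else {
      if (spillable.isEmpty) {        // crossing the in-memory threshold:
        spillable ++= inMemory        // migrate everything buffered so far
        inMemory.clear()
      }
      spillable += row
      if (spillable.length % spillThreshold == 0) {
        spillCount += 1               // the real sorter would write a spill file here
      }
    }
  }

  def numSpills: Int = spillCount
}

object TwoThresholdBufferDemo extends App {
  val buf = new TwoThresholdBuffer[Int](inMemoryThreshold = 4, spillThreshold = 8)
  (1 to 20).foreach(buf.add)
  println(s"simulated spills = ${buf.numSpills}")  // 20 rows with threshold 8 => 2 simulated spills
}
```

Running the demo with 20 rows, an in-memory threshold of 4 and a spill threshold of 8 reports 2 simulated spills. The matching `SQLConf` accessors follow in the second hunk.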
```diff
@@ -1137,11 +1160,19 @@ class SQLConf extends Serializable with Logging {
 
   def joinReorderDPStarFilter: Boolean = getConf(SQLConf.JOIN_REORDER_DP_STAR_FILTER)
 
+  def windowExecBufferInMemoryThreshold: Int = getConf(WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD)
+
   def windowExecBufferSpillThreshold: Int = getConf(WINDOW_EXEC_BUFFER_SPILL_THRESHOLD)
 
+  def sortMergeJoinExecBufferInMemoryThreshold: Int =
+    getConf(SORT_MERGE_JOIN_EXEC_BUFFER_IN_MEMORY_THRESHOLD)
+
   def sortMergeJoinExecBufferSpillThreshold: Int =
     getConf(SORT_MERGE_JOIN_EXEC_BUFFER_SPILL_THRESHOLD)
 
+  def cartesianProductExecBufferInMemoryThreshold: Int =
+    getConf(CARTESIAN_PRODUCT_EXEC_BUFFER_IN_MEMORY_THRESHOLD)
+
   def cartesianProductExecBufferSpillThreshold: Int =
     getConf(CARTESIAN_PRODUCT_EXEC_BUFFER_SPILL_THRESHOLD)
 
```
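A possible usage note, not part of the diff: because each operator now has its own pair of knobs, they can be tuned independently through the usual SQL conf mechanism. The keys below come from the hunks above, the values are arbitrary placeholders, and `spark` is assumed to be an existing `SparkSession`; since all of these configs are marked `.internal()`, they are intentionally absent from the public docs.

```scala
// Hypothetical tuning example; keys come from the diff above, values are arbitrary.
// Assumes `spark` is an existing SparkSession.
spark.conf.set("spark.sql.sortMergeJoinExec.buffer.in.memory.threshold", "10000")
spark.conf.set("spark.sql.sortMergeJoinExec.buffer.spill.threshold", "1000000")

spark.conf.set("spark.sql.windowExec.buffer.in.memory.threshold", "4096")
spark.conf.set("spark.sql.windowExec.buffer.spill.threshold", "1000000")

spark.conf.set("spark.sql.cartesianProductExec.buffer.in.memory.threshold", "4096")
spark.conf.set("spark.sql.cartesianProductExec.buffer.spill.threshold", "1000000")
```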
Can we just have one config for both window and SMJ? Ideally we can say this config is for `ExternalAppendOnlyUnsafeRowArray`.
I am fine with that. We could even go a step further and have just two configs, an in-memory threshold and a spill threshold, at the `ExternalAppendOnlyUnsafeRowArray` level for all its clients (currently SMJ, cartesian product, and Window). That way we get consistency across all clients for both knobs. One downside is backward compatibility: the spill threshold was already defined at the per-operator level and people might be using it in production. Let me know what you think.
ok let's keep them separated for each operator.
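With the per-operator configs kept, each client of `ExternalAppendOnlyUnsafeRowArray` reads its own pair of thresholds and hands them to the shared buffer. The sketch below shows the shape of that wiring for the sort merge join case; it assumes the two-argument constructor `(inMemoryThreshold, spillThreshold)` this PR introduces, the helper name `bufferedMatches` is made up, and the snippet is not expected to compile verbatim outside Spark's own `org.apache.spark.sql` packages (the class is package-private).

```scala
// Sketch of the per-operator wiring; assumes a two-argument constructor
// ExternalAppendOnlyUnsafeRowArray(inMemoryThreshold, spillThreshold) as introduced in this PR.
import org.apache.spark.sql.execution.ExternalAppendOnlyUnsafeRowArray
import org.apache.spark.sql.internal.SQLConf

// Hypothetical helper: builds the buffer for sort merge join using its own two knobs.
def bufferedMatches(conf: SQLConf): ExternalAppendOnlyUnsafeRowArray =
  new ExternalAppendOnlyUnsafeRowArray(
    conf.sortMergeJoinExecBufferInMemoryThreshold,  // rows guaranteed to stay on heap
    conf.sortMergeJoinExecBufferSpillThreshold)     // rows after which the sorter spills
```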