[SPARK-30536][CORE][SQL] Sort-merge join operator spilling performance improvements #27246
Changes from all commits (UnsafeSorterSpillReader.java):
```diff
@@ -47,55 +47,49 @@ public final class UnsafeSorterSpillReader extends UnsafeSorterIterator implemen
   private int numRecords;
   private int numRecordsRemaining;
 
-  private byte[] arr = new byte[1024 * 1024];
+  private byte[] arr = new byte[1024];
```
Member: If this number is performance-sensitive, could we parameterize it?

Author: Yes, I am looking into this. Thank you.

Author: Does this look good? Perhaps you have some suggestion: `private[spark] val UNSAFE_SORTER_SPILL_READER_BUFFER_SIZE_RATIO =`

Member: Could you apply the change to this PR first? Otherwise, it's hard to leave comments line by line.

Author: No problem, I will push an update soon. I am testing the changes with 10 TB benchmarks.
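The snippet quoted above is cut off in the thread. For illustration only, a complete entry along those lines could be written with Spark's ConfigBuilder DSL as in the sketch below; the key name, documentation, bounds, and default are assumptions, not the definition that ultimately landed in the PR.

```scala
// Hypothetical sketch of the proposed ratio entry -- the key name, bounds,
// and default here are assumptions; the actual definition may differ.
private[spark] val UNSAFE_SORTER_SPILL_READER_BUFFER_SIZE_RATIO =
  ConfigBuilder("spark.unsafe.sorter.spill.reader.buffer.size.ratio") // assumed key
    .internal()
    .doc("Fraction of spark.unsafe.sorter.spill.reader.buffer.size to " +
      "allocate up front; the read buffer grows on demand afterwards.")
    .doubleConf
    .checkValue(v => v > 0.0 && v <= 1.0, "The ratio must be in (0, 1].")
    .createWithDefault(0.5) // assumed default
```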
```diff
   private Object baseObject = arr;
   private final TaskContext taskContext = TaskContext.get();
+  private final SerializerManager serManager;
+  private final File dataFile;
+  private final BlockId blkId;
+  private boolean initialized;
 
   public UnsafeSorterSpillReader(
       SerializerManager serializerManager,
       File file,
       BlockId blockId) throws IOException {
-    assert (file.length() > 0);
-    final ConfigEntry<Object> bufferSizeConfigEntry =
-        package$.MODULE$.UNSAFE_SORTER_SPILL_READER_BUFFER_SIZE();
-    // This value must be less than or equal to MAX_BUFFER_SIZE_BYTES. Cast to int is always safe.
-    final int DEFAULT_BUFFER_SIZE_BYTES =
-        ((Long) bufferSizeConfigEntry.defaultValue().get()).intValue();
-    int bufferSizeBytes = SparkEnv.get() == null ? DEFAULT_BUFFER_SIZE_BYTES :
-        ((Long) SparkEnv.get().conf().get(bufferSizeConfigEntry)).intValue();
-
-    final boolean readAheadEnabled = SparkEnv.get() != null && (boolean)SparkEnv.get().conf().get(
-        package$.MODULE$.UNSAFE_SORTER_SPILL_READ_AHEAD_ENABLED());
-
-    final InputStream bs =
-        new NioBufferedFileInputStream(file, bufferSizeBytes);
-    try {
-      if (readAheadEnabled) {
-        this.in = new ReadAheadInputStream(serializerManager.wrapStream(blockId, bs),
-            bufferSizeBytes);
-      } else {
-        this.in = serializerManager.wrapStream(blockId, bs);
-      }
-      this.din = new DataInputStream(this.in);
-      numRecords = numRecordsRemaining = din.readInt();
-    } catch (IOException e) {
-      Closeables.close(bs, /* swallowIOException = */ true);
-      throw e;
-    }
+    serManager = serializerManager;
+    dataFile = file;
+    blkId = blockId;
+    initialized = false;
   }
 
   @Override
-  public int getNumRecords() {
+  public int getNumRecords() throws IOException {
+    if (!initialized) {
+      readSpilledFile();
+      initialized = true;
+    }
     return numRecords;
   }
 
   @Override
-  public boolean hasNext() {
+  public boolean hasNext() throws IOException {
+    if (!initialized) {
+      readSpilledFile();
+      initialized = true;
+    }
     return (numRecordsRemaining > 0);
   }
 
   @Override
   public void loadNext() throws IOException {
+    if (!initialized) {
+      readSpilledFile();
+      initialized = true;
+    }
     // Kill the task in case it has been marked as killed. This logic is from
     // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order
     // to avoid performance overhead. This check is added here in `loadNext()` instead of in
```
```diff
@@ -148,4 +142,34 @@ public void close() throws IOException {
       }
     }
   }
+
+  private void readSpilledFile() throws IOException {
+    assert (dataFile.length() > 0);
+    final ConfigEntry<Object> bufferSizeConfigEntry =
+        package$.MODULE$.UNSAFE_SORTER_SPILL_READER_BUFFER_SIZE();
+    // This value must be less than or equal to MAX_BUFFER_SIZE_BYTES. Cast to int is always safe.
+    final int DEFAULT_BUFFER_SIZE_BYTES =
+        ((Long) bufferSizeConfigEntry.defaultValue().get()).intValue();
+    int bufferSizeBytes = SparkEnv.get() == null ? DEFAULT_BUFFER_SIZE_BYTES :
+        ((Long) SparkEnv.get().conf().get(bufferSizeConfigEntry)).intValue();
+
+    final boolean readAheadEnabled = SparkEnv.get() != null && (boolean)SparkEnv.get().conf().get(
+        package$.MODULE$.UNSAFE_SORTER_SPILL_READ_AHEAD_ENABLED());
+
+    final InputStream bs =
+        new NioBufferedFileInputStream(dataFile, bufferSizeBytes);
+    try {
+      if (readAheadEnabled) {
+        this.in = new ReadAheadInputStream(serManager.wrapStream(blkId, bs),
+            bufferSizeBytes);
+      } else {
+        this.in = serManager.wrapStream(blkId, bs);
+      }
+      this.din = new DataInputStream(this.in);
+      numRecords = numRecordsRemaining = din.readInt();
+    } catch (IOException e) {
+      Closeables.close(bs, /* swallowIOException = */ true);
+      throw e;
+    }
+  }
 }
```
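Taken together, the two hunks move all buffer allocation and stream construction out of the constructor and into `readSpilledFile()`, which runs at most once, on first access. A minimal standalone sketch of this lazy-open pattern (illustration only, not Spark code; the class and its record-count header are hypothetical):

```scala
import java.io.{BufferedInputStream, Closeable, DataInputStream, File, FileInputStream, IOException}

// Minimal illustration of the lazy-open pattern used by the PR: the
// constructor does no I/O, so spill readers that are created but never
// consumed pay no file-open or buffer cost.
final class LazySpillReader(file: File) extends Closeable {
  private var din: DataInputStream = _ // opened on first access
  private var recordsRemaining = 0

  @throws[IOException]
  private def ensureOpen(): Unit = {
    if (din == null) {
      din = new DataInputStream(new BufferedInputStream(new FileInputStream(file)))
      recordsRemaining = din.readInt() // header written at spill time: record count
    }
  }

  @throws[IOException]
  def hasNext: Boolean = {
    ensureOpen()
    recordsRemaining > 0
  }

  override def close(): Unit = {
    if (din != null) din.close()
  }
}
```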