Skip to content

Commit fbe726f

Browse files
HeartSaVioR authored and HyukjinKwon committed
[SPARK-34339][CORE][SQL] Expose the number of total paths in Utils.buildLocationMetadata()
### What changes were proposed in this pull request? This PR proposes to expose the number of total paths in Utils.buildLocationMetadata(), with relaxing space usage a bit (around 10+ chars). Suppose the first 2 of 5 paths are only fit to the threshold, the outputs between the twos are below: * before the change: `[path1, path2]` * after the change: `(5 paths)[path1, path2, ...]` ### Why are the changes needed? SPARK-31793 silently truncates the paths hence end users can't indicate how many paths are truncated, and even more, whether paths are truncated or not. ### Does this PR introduce _any_ user-facing change? Yes, the location metadata will also show how many paths are truncated (not shown), instead of silently truncated. ### How was this patch tested? Modified UTs Closes #31464 from HeartSaVioR/SPARK-34339. Authored-by: Jungtaek Lim (HeartSaVioR) <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent 7675582 commit fbe726f

File tree

5 files changed

+20
-16
lines changed

5 files changed

+20
-16
lines changed

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2980,7 +2980,7 @@ private[spark] object Utils extends Logging {
29802980
* exceeds `stopAppendingThreshold`, stop appending paths for saving memory.
29812981
*/
29822982
def buildLocationMetadata(paths: Seq[Path], stopAppendingThreshold: Int): String = {
2983-
val metadata = new StringBuilder("[")
2983+
val metadata = new StringBuilder(s"(${paths.length} paths)[")
29842984
var index: Int = 0
29852985
while (index < paths.length && metadata.length < stopAppendingThreshold) {
29862986
if (index > 0) {
@@ -2989,6 +2989,12 @@ private[spark] object Utils extends Logging {
29892989
metadata.append(paths(index).toString)
29902990
index += 1
29912991
}
2992+
if (paths.length > index) {
2993+
if (index > 0) {
2994+
metadata.append(", ")
2995+
}
2996+
metadata.append("...")
2997+
}
29922998
metadata.append("]")
29932999
metadata.toString
29943000
}

core/src/test/scala/org/apache/spark/util/UtilsSuite.scala

Lines changed: 5 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1304,16 +1304,11 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
13041304

13051305
test("pathsToMetadata") {
13061306
val paths = (0 to 4).map(i => new Path(s"path$i"))
1307-
assert(Utils.buildLocationMetadata(paths, 5) == "[path0]")
1308-
assert(Utils.buildLocationMetadata(paths, 10) == "[path0, path1]")
1309-
assert(Utils.buildLocationMetadata(paths, 15) == "[path0, path1, path2]")
1310-
assert(Utils.buildLocationMetadata(paths, 25) == "[path0, path1, path2, path3]")
1311-
1312-
// edge-case: we should consider the fact non-path chars including '[' and ", " are accounted
1313-
// 1. second path is not added due to the addition of '['
1314-
assert(Utils.buildLocationMetadata(paths, 6) == "[path0]")
1315-
// 2. third path is not added due to the addition of ", "
1316-
assert(Utils.buildLocationMetadata(paths, 13) == "[path0, path1]")
1307+
assert(Utils.buildLocationMetadata(paths, 10) == "(5 paths)[...]")
1308+
// 11 is the minimum threshold to print at least one path
1309+
assert(Utils.buildLocationMetadata(paths, 11) == "(5 paths)[path0, ...]")
1310+
// 11 + 5 + 2 = 18 is the minimum threshold to print two paths
1311+
assert(Utils.buildLocationMetadata(paths, 18) == "(5 paths)[path0, path1, ...]")
13171312
}
13181313

13191314
test("checkHost supports both IPV4 and IPV6") {

external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2132,7 +2132,7 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper {
21322132
|Output \\[2\\]: \\[value#xL, id#x\\]
21332133
|DataFilters: \\[isnotnull\\(value#xL\\), \\(value#xL > 2\\)\\]
21342134
|Format: avro
2135-
|Location: InMemoryFileIndex\\[.*\\]
2135+
|Location: InMemoryFileIndex\\([0-9]+ paths\\)\\[.*\\]
21362136
|PartitionFilters: \\[isnotnull\\(id#x\\), \\(id#x > 1\\)\\]
21372137
|PushedFilers: \\[IsNotNull\\(value\\), GreaterThan\\(value,2\\)\\]
21382138
|ReadSchema: struct\\<value:bigint\\>

sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -411,7 +411,7 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
411411
|Output \\[2\\]: \\[value#x, id#x\\]
412412
|DataFilters: \\[isnotnull\\(value#x\\), \\(value#x > 2\\)\\]
413413
|Format: $fmt
414-
|Location: InMemoryFileIndex\\[.*\\]
414+
|Location: InMemoryFileIndex\\([0-9]+ paths\\)\\[.*\\]
415415
|PartitionFilters: \\[isnotnull\\(id#x\\), \\(id#x > 1\\)\\]
416416
${pushFilterMaps.get(fmt).get}
417417
|ReadSchema: struct\\<value:int\\>

sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,6 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
122122

123123
test("SPARK-31793: FileSourceScanExec metadata should contain limited file paths") {
124124
withTempPath { path =>
125-
val dir = path.getCanonicalPath
126-
127125
// create a sub-directory with long name so that each root path will always exceed the limit
128126
// this is to ensure we always test the case for the path truncation
129127
val dataDirName = Random.alphanumeric.take(100).toList.mkString
@@ -146,6 +144,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
146144
// The location metadata should at least contain one path
147145
assert(location.get.contains(paths.head))
148146

147+
// The location metadata should have the number of root paths
148+
assert(location.get.contains("(10 paths)"))
149+
149150
// The location metadata should have bracket wrapping paths
150151
assert(location.get.indexOf('[') > -1)
151152
assert(location.get.indexOf(']') > -1)
@@ -155,7 +156,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
155156
location.get.indexOf('[') + 1, location.get.indexOf(']')).split(", ").toSeq
156157

157158
// the only one path should be available
158-
assert(pathsInLocation.size == 1)
159+
assert(pathsInLocation.size == 2)
160+
// indicator ("...") should be available
161+
assert(pathsInLocation.exists(_.contains("...")))
159162
}
160163
}
161164
}

0 commit comments

Comments (0)