Skip to content

Commit a7cfe51

Browse files
xuejianbest authored and srowen committed
[SPARK-25108][SQL] Fix the show method to display the wide character alignment problem
This is not a perfect solution. It is designed to minimize complexity on the basis of solving problems. It is effective for English, Chinese characters, Japanese, Korean and so on. ```scala before: +---+---------------------------+-------------+ |id |中国 |s2 | +---+---------------------------+-------------+ |1 |ab |[a] | |2 |null |[中国, abc] | |3 |ab1 |[hello world]| |4 |か行 きゃ(kya) きゅ(kyu) きょ(kyo) |[“中国] | |5 |中国(你好)a |[“中(国), 312] | |6 |中国山(东)服务区 |[“中(国)] | |7 |中国山东服务区 |[中(国)] | |8 | |[中国] | +---+---------------------------+-------------+ after: +---+-----------------------------------+----------------+ |id |中国 |s2 | +---+-----------------------------------+----------------+ |1 |ab |[a] | |2 |null |[中国, abc] | |3 |ab1 |[hello world] | |4 |か行 きゃ(kya) きゅ(kyu) きょ(kyo) |[“中国] | |5 |中国(你好)a |[“中(国), 312]| |6 |中国山(东)服务区 |[“中(国)] | |7 |中国山东服务区 |[中(国)] | |8 | |[中国] | +---+-----------------------------------+----------------+ ``` ## What changes were proposed in this pull request? When there are wide characters such as Chinese characters or Japanese characters in the data, the show method has a alignment problem. Try to fix this problem. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) ![image](https://user-images.githubusercontent.com/13044869/44250564-69f6b400-a227-11e8-88b2-6cf6960377ff.png) Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #22048 from xuejianbest/master. Authored-by: xuejianbest <[email protected]> Signed-off-by: Sean Owen <[email protected]>
1 parent 3682d29 commit a7cfe51

File tree

4 files changed

+109
-9
lines changed

4 files changed

+109
-9
lines changed

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2795,6 +2795,36 @@ private[spark] object Utils extends Logging {
27952795
}
27962796
}
27972797
}
2798+
2799+
/**
2800+
* Regular expression matching full width characters.
2801+
*
2802+
* Looked at all the 0x0000-0xFFFF characters (unicode) and showed them under Xshell.
2803+
* Found all the full width characters, then get the regular expression.
2804+
*/
2805+
private val fullWidthRegex = ("""[""" +
2806+
// scalastyle:off nonascii
2807+
"""\u1100-\u115F""" +
2808+
"""\u2E80-\uA4CF""" +
2809+
"""\uAC00-\uD7A3""" +
2810+
"""\uF900-\uFAFF""" +
2811+
"""\uFE10-\uFE19""" +
2812+
"""\uFE30-\uFE6F""" +
2813+
"""\uFF00-\uFF60""" +
2814+
"""\uFFE0-\uFFE6""" +
2815+
// scalastyle:on nonascii
2816+
"""]""").r
2817+
2818+
/**
2819+
* Return the number of half widths in a given string. Note that a full width character
2820+
* occupies two half widths.
2821+
*
2822+
* For a string consisting of 1 million characters, the execution of this method requires
2823+
* about 50ms.
2824+
*/
2825+
def stringHalfWidth(str: String): Int = {
2826+
if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
2827+
}
27982828
}
27992829

28002830
private[util] object CallerContext extends Logging {

core/src/test/scala/org/apache/spark/util/UtilsSuite.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,6 +1184,27 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
11841184
assert(Utils.getSimpleName(classOf[MalformedClassObject.MalformedClass]) ===
11851185
"UtilsSuite$MalformedClassObject$MalformedClass")
11861186
}
1187+
1188+
test("stringHalfWidth") {
1189+
// scalastyle:off nonascii
1190+
assert(Utils.stringHalfWidth(null) == 0)
1191+
assert(Utils.stringHalfWidth("") == 0)
1192+
assert(Utils.stringHalfWidth("ab c") == 4)
1193+
assert(Utils.stringHalfWidth("1098") == 4)
1194+
assert(Utils.stringHalfWidth("") == 2)
1195+
assert(Utils.stringHalfWidth("γύρ") == 3)
1196+
assert(Utils.stringHalfWidth("") == 2)
1197+
assert(Utils.stringHalfWidth("") == 2)
1198+
assert(Utils.stringHalfWidth("") == 2)
1199+
assert(Utils.stringHalfWidth("") == 2)
1200+
assert(Utils.stringHalfWidth("") == 2)
1201+
assert(Utils.stringHalfWidth("à") == 1)
1202+
assert(Utils.stringHalfWidth("") == 2)
1203+
assert(Utils.stringHalfWidth("羍む") == 4)
1204+
assert(Utils.stringHalfWidth("뺭ᾘ") == 3)
1205+
assert(Utils.stringHalfWidth("\u0967\u0968\u0969") == 3)
1206+
// scalastyle:on nonascii
1207+
}
11871208
}
11881209

11891210
private class SimpleExtension

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -306,16 +306,16 @@ class Dataset[T] private[sql](
306306
// Compute the width of each column
307307
for (row <- rows) {
308308
for ((cell, i) <- row.zipWithIndex) {
309-
colWidths(i) = math.max(colWidths(i), cell.length)
309+
colWidths(i) = math.max(colWidths(i), Utils.stringHalfWidth(cell))
310310
}
311311
}
312312

313313
val paddedRows = rows.map { row =>
314314
row.zipWithIndex.map { case (cell, i) =>
315315
if (truncate > 0) {
316-
StringUtils.leftPad(cell, colWidths(i))
316+
StringUtils.leftPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
317317
} else {
318-
StringUtils.rightPad(cell, colWidths(i))
318+
StringUtils.rightPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
319319
}
320320
}
321321
}
@@ -337,12 +337,10 @@ class Dataset[T] private[sql](
337337

338338
// Compute the width of field name and data columns
339339
val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
340-
math.max(curMax, fieldName.length)
340+
math.max(curMax, Utils.stringHalfWidth(fieldName))
341341
}
342342
val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
343-
math.max(curMax, row.map(_.length).reduceLeftOption[Int] { case (cellMax, cell) =>
344-
math.max(cellMax, cell)
345-
}.getOrElse(0))
343+
math.max(curMax, row.map(cell => Utils.stringHalfWidth(cell)).max)
346344
}
347345

348346
dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -351,8 +349,10 @@ class Dataset[T] private[sql](
351349
s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
352350
sb.append(rowHeader).append("\n")
353351
row.zipWithIndex.map { case (cell, j) =>
354-
val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth)
355-
val data = StringUtils.rightPad(cell, dataColWidth)
352+
val fieldName = StringUtils.rightPad(fieldNames(j),
353+
fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)) + fieldNames(j).length)
354+
val data = StringUtils.rightPad(cell,
355+
dataColWidth - Utils.stringHalfWidth(cell) + cell.length)
356356
s" $fieldName | $data "
357357
}.addString(sb, "", "\n", "\n")
358358
}

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,55 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
969969
checkShowString(ds, expected)
970970
}
971971

972+
test("SPARK-25108 Fix the show method to display the full width character alignment problem") {
973+
// scalastyle:off nonascii
974+
val df = Seq(
975+
(0, null, 1),
976+
(0, "", 1),
977+
(0, "ab c", 1),
978+
(0, "1098", 1),
979+
(0, "", 1),
980+
(0, "γύρ", 1),
981+
(0, "", 1),
982+
(0, "", 1),
983+
(0, "", 1),
984+
(0, "", 1),
985+
(0, "", 1),
986+
(0, "à", 1),
987+
(0, "", 1),
988+
(0, "羍む", 1),
989+
(0, "뺭ᾘ", 1),
990+
(0, "\u0967\u0968\u0969", 1)
991+
).toDF("b", "a", "c")
992+
// scalastyle:on nonascii
993+
val ds = df.as[ClassData]
994+
val expected =
995+
// scalastyle:off nonascii
996+
"""+---+----+---+
997+
|| b| a| c|
998+
|+---+----+---+
999+
|| 0|null| 1|
1000+
|| 0| | 1|
1001+
|| 0|ab c| 1|
1002+
|| 0|1098| 1|
1003+
|| 0| mø| 1|
1004+
|| 0| γύρ| 1|
1005+
|| 0| pê| 1|
1006+
|| 0| ー| 1|
1007+
|| 0| 测| 1|
1008+
|| 0| か| 1|
1009+
|| 0| 걸| 1|
1010+
|| 0| à| 1|
1011+
|| 0| 焼| 1|
1012+
|| 0|羍む| 1|
1013+
|| 0| 뺭ᾘ| 1|
1014+
|| 0| १२३| 1|
1015+
|+---+----+---+
1016+
|""".stripMargin
1017+
// scalastyle:on nonascii
1018+
checkShowString(ds, expected)
1019+
}
1020+
9721021
test(
9731022
"SPARK-15112: EmbedDeserializerInFilter should not optimize plan fragment that changes schema"
9741023
) {

0 commit comments

Comments
 (0)