From 1b9b2e73b8637b8da15b790a06b08e8810fc75d9 Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Thu, 9 Aug 2018 11:11:04 +0800
Subject: [PATCH 01/10] Fix the show method to display the wide character
 alignment problem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

before:
+---+---------------------------+-------------+
|id |中国                         |s2           |
+---+---------------------------+-------------+
|1  |ab                         |[a]          |
|2  |null                       |[中国, abc]   |
|3  |ab1                        |[hello world]|
|4  |か行 きゃ(kya) きゅ(kyu) きょ(kyo) |[“中国]        |
|5  |中国(你好)a                   |[“中(国), 312] |
|6  |中国山(东)服务区                  |[“中(国)]     |
|7  |中国山东服务区                    |[中(国)]      |
|8  |                           |[中国]         |
+---+---------------------------+-------------+

after:
+---+--------------------------+----------------+
| id|                      中国|              s2|
+---+--------------------------+----------------+
|  1|                        ab|             [a]|
|  2|                      null|     [中国, abc]|
|  3|                       ab1|   [hello world]|
|  4|か行 きゃ(kya) きゅ(kyu...|         [“中国]|
|  5|              中国(你好)a|  [“中(国), 312]|
|  6|         中国山(东)服务区|      [“中(国)]|
|  7|            中国山东服务区|       [中(国)]|
|  8|                          |          [中国]|
+---+--------------------------+----------------+
---
 .../scala/org/apache/spark/sql/Dataset.scala | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 0aee1d7be578..6af76ca81127 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -280,6 +280,7 @@ class Dataset[T] private[sql](
 
     // We set a minimum column width at '3'
     val minimumColWidth = 3
+    val regex = """[^\x00-\xff]""".r
     if (!vertical) {
       // Initialise the width of each column to a minimum value
       val colWidths = Array.fill(numCols)(minimumColWidth)
@@ -287,7 +288,7 @@ class Dataset[T] private[sql](
 
       // Compute the width of each column
      for (row <- rows) {
        for ((cell, i) <- row.zipWithIndex) {
-          colWidths(i) = math.max(colWidths(i), cell.length)
+          colWidths(i) = math.max(colWidths(i), cell.length + regex.findAllIn(cell).size)
        }
      }
@@ -297,9 +298,9 @@ class Dataset[T] private[sql](
 
      // column names
      rows.head.zipWithIndex.map { case (cell, i) =>
        if (truncate > 0) {
-          StringUtils.leftPad(cell, colWidths(i))
+          StringUtils.leftPad(cell, colWidths(i) - regex.findAllIn(cell).size)
        } else {
-          StringUtils.rightPad(cell, colWidths(i))
+          StringUtils.rightPad(cell, colWidths(i) - regex.findAllIn(cell).size)
        }
      }.addString(sb, "|", "|", "|\n")
@@ -309,9 +310,9 @@ class Dataset[T] private[sql](
      rows.tail.foreach {
        _.zipWithIndex.map { case (cell, i) =>
          if (truncate > 0) {
-            StringUtils.leftPad(cell.toString, colWidths(i))
+            StringUtils.leftPad(cell.toString, colWidths(i) - regex.findAllIn(cell).size)
          } else {
-            StringUtils.rightPad(cell.toString, colWidths(i))
+            StringUtils.rightPad(cell.toString, colWidths(i) - regex.findAllIn(cell).size)
          }
        }.addString(sb, "|", "|", "|\n")
      }
@@ -324,12 +325,10 @@ class Dataset[T] private[sql](
 
      // Compute the width of field name and data columns
      val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
-        math.max(curMax, fieldName.length)
+        math.max(curMax, fieldName.length + regex.findAllIn(fieldName).size)
      }
      val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
-        math.max(curMax, row.map(_.length).reduceLeftOption[Int] { case (cellMax, cell) =>
-          math.max(cellMax, cell)
-        }.getOrElse(0))
+        math.max(curMax, row.map(cell => cell.length + regex.findAllIn(cell).size).max)
      }

      dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -338,8 +337,8 @@ class Dataset[T] private[sql](
          s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
        sb.append(rowHeader).append("\n")
        row.zipWithIndex.map { case (cell, j) =>
-          val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth)
-          val data = StringUtils.rightPad(cell, dataColWidth)
+          val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth - regex.findAllIn(fieldNames(j)).size)
+          val data = StringUtils.rightPad(cell, dataColWidth - regex.findAllIn(cell).size)
          s" $fieldName | $data "
        }.addString(sb, "", "\n", "\n")
      }

From 906c0ba736552ae83c1fa3be1b501e9dbb61c5b1 Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Fri, 10 Aug 2018 17:38:31 +0800
Subject: [PATCH 02/10] Modifying regular expressions for matching narrow
 characters

---
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 9f555fd0abfa..40459d4c46ba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -294,7 +294,7 @@ class Dataset[T] private[sql](
 
     // We set a minimum column width at '3'
     val minimumColWidth = 3
-    val regex = """[^\x00-\xff“”]""".r
+    val regex = """[^\x00-\u2e39]""".r
     if (!vertical) {
       // Initialise the width of each column to a minimum value
       val colWidths = Array.fill(numCols)(minimumColWidth)

From da37d2ef68c0212b723dd96b4ae571f6a16f03ad Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Tue, 28 Aug 2018 17:05:14 +0800
Subject: [PATCH 03/10] Fix the show method to display full width characters.

Modify regular expressions to make them more precise.
Modify variable names and add comments.
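
For illustration, the core idea of this change in isolation (a standalone
sketch, not the patched Spark code itself): every character matched by the
full width regex occupies two terminal columns, so the display width of a
cell is its character count plus the number of full width matches.

    // Sketch: measure display width the way this patch does.
    val fullWidthRegex =
      ("""[\u1100-\u115F\u2E80-\uA4CF\uAC00-\uD7A3\uF900-\uFAFF""" +
        """\uFE10-\uFE19\uFE30-\uFE6F\uFF00-\uFF60\uFFE0-\uFFE6]""").r

    def displayWidth(s: String): Int = s.length + fullWidthRegex.findAllIn(s).size

    displayWidth("ab")    // 2 -> half width characters only
    displayWidth("中国")  // 4 -> two full width characters, two columns each
    displayWidth("中a")   // 3 -> one full width plus one half width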
---
 .../scala/org/apache/spark/sql/Dataset.scala | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 40459d4c46ba..bdd2bec26ef0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -294,7 +294,8 @@ class Dataset[T] private[sql](
 
     // We set a minimum column width at '3'
     val minimumColWidth = 3
-    val regex = """[^\x00-\u2e39]""".r
+    //Regular expression matching full width characters
+    val fullWidthRegex = """[\u1100-\u115F\u2E80-\uA4CF\uAC00-\uD7A3\uF900-\uFAFF\uFE10-\uFE19\uFE30-\uFE6F\uFF00-\uFF60\uFFE0-\uFFE6]""".r
     if (!vertical) {
       // Initialise the width of each column to a minimum value
       val colWidths = Array.fill(numCols)(minimumColWidth)
@@ -302,16 +303,16 @@ class Dataset[T] private[sql](
      // Compute the width of each column
      for (row <- rows) {
        for ((cell, i) <- row.zipWithIndex) {
-          colWidths(i) = math.max(colWidths(i), cell.length + regex.findAllIn(cell).size)
+          colWidths(i) = math.max(colWidths(i), cell.length + fullWidthRegex.findAllIn(cell).size)
        }
      }

      val paddedRows = rows.map { row =>
        row.zipWithIndex.map { case (cell, i) =>
          if (truncate > 0) {
-            StringUtils.leftPad(cell, colWidths(i) - regex.findAllIn(cell).size)
+            StringUtils.leftPad(cell, colWidths(i) - fullWidthRegex.findAllIn(cell).size)
          } else {
-            StringUtils.rightPad(cell, colWidths(i) - regex.findAllIn(cell).size)
+            StringUtils.rightPad(cell, colWidths(i) - fullWidthRegex.findAllIn(cell).size)
          }
        }
      }
@@ -333,10 +334,10 @@ class Dataset[T] private[sql](
 
      // Compute the width of field name and data columns
      val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
-        math.max(curMax, fieldName.length + regex.findAllIn(fieldName).size)
+        math.max(curMax, fieldName.length + fullWidthRegex.findAllIn(fieldName).size)
      }
      val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
-        math.max(curMax, row.map(cell => cell.length + regex.findAllIn(cell).size).max)
+        math.max(curMax, row.map(cell => cell.length + fullWidthRegex.findAllIn(cell).size).max)
      }

      dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -345,8 +346,8 @@ class Dataset[T] private[sql](
          s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
        sb.append(rowHeader).append("\n")
        row.zipWithIndex.map { case (cell, j) =>
-          val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth - regex.findAllIn(fieldNames(j)).size)
-          val data = StringUtils.rightPad(cell, dataColWidth - regex.findAllIn(cell).size)
+          val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth - fullWidthRegex.findAllIn(fieldNames(j)).size)
+          val data = StringUtils.rightPad(cell, dataColWidth - fullWidthRegex.findAllIn(cell).size)
          s" $fieldName | $data "
        }.addString(sb, "", "\n", "\n")
      }

From 8737671db590e82118aa729c690f51b7af8d5674 Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Wed, 29 Aug 2018 11:42:37 +0800
Subject: [PATCH 04/10] Write a utility method to get the number of half
 widths of a string

`fullWidthRegex` is defined outside `stringHalfWidth` to avoid recompiling
the regular expression on every call: compilation is expensive, so
frequently used regexes should be constructed once.
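
The difference this avoids looks roughly like the following (an
illustrative sketch with made-up object names, not the patched code):

    import scala.util.matching.Regex

    object Hoisted {
      // The pattern is compiled once, when the object is initialized.
      private val fullWidthRegex: Regex = """[\u1100-\u115F]""".r

      def halfWidths(s: String): Int = s.length + fullWidthRegex.findAllIn(s).size
    }

    object PerCall {
      def halfWidths(s: String): Int = {
        // Recompiles the underlying java.util.regex.Pattern on every call.
        val r: Regex = """[\u1100-\u115F]""".r
        s.length + r.findAllIn(s).size
      }
    }

Both objects return the same counts; only Hoisted pays the compilation
cost once instead of per call.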
---
 .../scala/org/apache/spark/sql/Dataset.scala | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index bdd2bec26ef0..7382e2af3929 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -294,8 +294,12 @@ class Dataset[T] private[sql](
 
     // We set a minimum column width at '3'
     val minimumColWidth = 3
-    //Regular expression matching full width characters
+    // Regular expression matching full width characters
     val fullWidthRegex = """[\u1100-\u115F\u2E80-\uA4CF\uAC00-\uD7A3\uF900-\uFAFF\uFE10-\uFE19\uFE30-\uFE6F\uFF00-\uFF60\uFFE0-\uFFE6]""".r
+    // The number of half width of a string
+    def stringHalfWidth = (str: String) => {
+      str.length + fullWidthRegex.findAllIn(str).size
+    }
     if (!vertical) {
       // Initialise the width of each column to a minimum value
       val colWidths = Array.fill(numCols)(minimumColWidth)
@@ -303,16 +307,16 @@ class Dataset[T] private[sql](
      // Compute the width of each column
      for (row <- rows) {
        for ((cell, i) <- row.zipWithIndex) {
-          colWidths(i) = math.max(colWidths(i), cell.length + fullWidthRegex.findAllIn(cell).size)
+          colWidths(i) = math.max(colWidths(i), stringHalfWidth(cell))
        }
      }

      val paddedRows = rows.map { row =>
        row.zipWithIndex.map { case (cell, i) =>
          if (truncate > 0) {
-            StringUtils.leftPad(cell, colWidths(i) - fullWidthRegex.findAllIn(cell).size)
+            " " * (colWidths(i) - stringHalfWidth(cell)) + cell
          } else {
-            StringUtils.rightPad(cell, colWidths(i) - fullWidthRegex.findAllIn(cell).size)
+            cell + " " * (colWidths(i) - stringHalfWidth(cell))
          }
        }
      }
@@ -334,10 +338,10 @@ class Dataset[T] private[sql](
 
      // Compute the width of field name and data columns
      val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
-        math.max(curMax, fieldName.length + fullWidthRegex.findAllIn(fieldName).size)
+        math.max(curMax, stringHalfWidth(fieldName))
      }
      val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
-        math.max(curMax, row.map(cell => cell.length + fullWidthRegex.findAllIn(cell).size).max)
+        math.max(curMax, row.map(cell => stringHalfWidth(cell)).max)
      }

      dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -346,8 +350,8 @@ class Dataset[T] private[sql](
          s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
        sb.append(rowHeader).append("\n")
        row.zipWithIndex.map { case (cell, j) =>
-          val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth - fullWidthRegex.findAllIn(fieldNames(j)).size)
-          val data = StringUtils.rightPad(cell, dataColWidth - fullWidthRegex.findAllIn(cell).size)
+          val fieldName = fieldNames(j) + " " * (fieldNameColWidth - stringHalfWidth(fieldNames(j)))
+          val data = cell + " " * (dataColWidth - stringHalfWidth(cell))
          s" $fieldName | $data "
        }.addString(sb, "", "\n", "\n")
      }

From 697ac047d85b5579c25fbd5f5e25d099562d20fc Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Wed, 29 Aug 2018 17:27:14 +0800
Subject: [PATCH 05/10] Move the stringHalfWidth method into util.Utils

Move the stringHalfWidth method into util.Utils and add tests for it.
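
From the caller's side the utility then reads as below (a hypothetical
REPL-style sketch; Utils is private[spark], so the real call sites live
inside Spark itself, e.g. Dataset.scala):

    import org.apache.spark.util.Utils

    Utils.stringHalfWidth("hello")  // 5 -> all half width characters
    Utils.stringHalfWidth("中国")   // 4 -> each full width character counts twice
    Utils.stringHalfWidth(null)     // 0 -> the method is null-safe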
---
 .../scala/org/apache/spark/util/Utils.scala   | 21 +++++++++++++++++++
 .../org/apache/spark/util/UtilsSuite.scala    | 19 +++++++++++++++++
 .../scala/org/apache/spark/sql/Dataset.scala  | 20 +++++-------------
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 7ec707d94ed8..8e0a7d91d7b3 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2794,6 +2794,27 @@ private[spark] object Utils extends Logging {
       }
     }
   }
+
+  /**
+   * Regular expression matching full width characters
+   */
+  private lazy val fullWidthRegex = ("""[""" +
+    """\u1100-\u115F""" +
+    """\u2E80-\uA4CF""" +
+    """\uAC00-\uD7A3""" +
+    """\uF900-\uFAFF""" +
+    """\uFE10-\uFE19""" +
+    """\uFE30-\uFE6F""" +
+    """\uFF00-\uFF60""" +
+    """\uFFE0-\uFFE6""" +
+    """]""").r
+  /**
+   * Return the number of half width of a string
+   * A full width character occupies two half widths
+   */
+  def stringHalfWidth(str: String): Int = {
+    if(str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
+  }
 }

 private[util] object CallerContext extends Logging {

diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 418d2f9b8850..ac907c0dc1b1 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -1184,6 +1184,25 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     assert(Utils.getSimpleName(classOf[MalformedClassObject.MalformedClass]) ===
       "UtilsSuite$MalformedClassObject$MalformedClass")
   }
+
+  test("stringHalfWidth") {
+    assert(Utils.stringHalfWidth(null) == 0)
+    assert(Utils.stringHalfWidth("") == 0)
+    assert(Utils.stringHalfWidth("ab c") == 4)
+    assert(Utils.stringHalfWidth("1098") == 4)
+    assert(Utils.stringHalfWidth("mø") == 2)
+    assert(Utils.stringHalfWidth("γύρ") == 3)
+    assert(Utils.stringHalfWidth("pê") == 2)
+    assert(Utils.stringHalfWidth("ー") == 2)
+    assert(Utils.stringHalfWidth("测") == 2)
+    assert(Utils.stringHalfWidth("か") == 2)
+    assert(Utils.stringHalfWidth("걸") == 2)
+    assert(Utils.stringHalfWidth("à") == 1)
+    assert(Utils.stringHalfWidth("焼") == 2)
+    assert(Utils.stringHalfWidth("羍む") == 4)
+    assert(Utils.stringHalfWidth("뺭ᾘ") == 3)
+    assert(Utils.stringHalfWidth("\u0967\u0968\u0969") == 3)
+  }
 }

 private class SimpleExtension

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 7382e2af3929..a884a567812d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -294,12 +294,6 @@ class Dataset[T] private[sql](
 
     // We set a minimum column width at '3'
     val minimumColWidth = 3
-    // Regular expression matching full width characters
-    val fullWidthRegex = """[\u1100-\u115F\u2E80-\uA4CF\uAC00-\uD7A3\uF900-\uFAFF\uFE10-\uFE19\uFE30-\uFE6F\uFF00-\uFF60\uFFE0-\uFFE6]""".r
-    // The number of half width of a string
-    def stringHalfWidth = (str: String) => {
-      str.length + fullWidthRegex.findAllIn(str).size
-    }
     if (!vertical) {
       // Initialise the width of each column to a minimum value
       val colWidths = Array.fill(numCols)(minimumColWidth)
@@ -307,16 +301,16 @@ class Dataset[T] private[sql](
      // Compute the width of each column
      for (row <- rows) {
        for ((cell, i) <- row.zipWithIndex) {
-          colWidths(i) = math.max(colWidths(i), stringHalfWidth(cell))
+          colWidths(i) = math.max(colWidths(i), Utils.stringHalfWidth(cell))
        }
      }

      val paddedRows = rows.map { row =>
        row.zipWithIndex.map { case (cell, i) =>
          if (truncate > 0) {
-            " " * (colWidths(i) - stringHalfWidth(cell)) + cell
+            " " * (colWidths(i) - Utils.stringHalfWidth(cell)) + cell
          } else {
-            cell + " " * (colWidths(i) - stringHalfWidth(cell))
+            cell + " " * (colWidths(i) - Utils.stringHalfWidth(cell))
          }
        }
      }
@@ -338,10 +332,10 @@ class Dataset[T] private[sql](
 
      // Compute the width of field name and data columns
      val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
-        math.max(curMax, stringHalfWidth(fieldName))
+        math.max(curMax, Utils.stringHalfWidth(fieldName))
      }
      val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
-        math.max(curMax, row.map(cell => stringHalfWidth(cell)).max)
+        math.max(curMax, row.map(cell => Utils.stringHalfWidth(cell)).max)
      }

      dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -350,8 +344,8 @@ class Dataset[T] private[sql](
          s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
        sb.append(rowHeader).append("\n")
        row.zipWithIndex.map { case (cell, j) =>
-          val fieldName = fieldNames(j) + " " * (fieldNameColWidth - stringHalfWidth(fieldNames(j)))
-          val data = cell + " " * (dataColWidth - stringHalfWidth(cell))
+          val fieldName = fieldNames(j) + " " * (fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)))
+          val data = cell + " " * (dataColWidth - Utils.stringHalfWidth(cell))
          s" $fieldName | $data "
        }.addString(sb, "", "\n", "\n")
      }

From 3d65e6b9ffb35a6c6b38313768c35a7f08e4e2db Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Thu, 30 Aug 2018 10:14:07 +0800
Subject: [PATCH 06/10] Formatted code and non-functional changes

---
 core/src/main/scala/org/apache/spark/util/Utils.scala  | 9 +++++----
 .../src/main/scala/org/apache/spark/sql/Dataset.scala | 10 ++++++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 8e0a7d91d7b3..89fd746b4243 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2798,7 +2798,7 @@ private[spark] object Utils extends Logging {
   /**
    * Regular expression matching full width characters
    */
-  private lazy val fullWidthRegex = ("""[""" +
+  private val fullWidthRegex = ("""[""" +
     """\u1100-\u115F""" +
     """\u2E80-\uA4CF""" +
     """\uAC00-\uD7A3""" +
@@ -2808,12 +2808,13 @@ private[spark] object Utils extends Logging {
     """\uFF00-\uFF60""" +
     """\uFFE0-\uFFE6""" +
     """]""").r
+
   /**
-   * Return the number of half width of a string
-   * A full width character occupies two half widths
+   * Return the number of half widths in a given string. Note that a full width character
+   * occupies two half widths.
    */
   def stringHalfWidth(str: String): Int = {
-    if(str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
+    if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
   }
 }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index a884a567812d..01a11c306c0f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -308,9 +308,9 @@ class Dataset[T] private[sql](
      val paddedRows = rows.map { row =>
        row.zipWithIndex.map { case (cell, i) =>
          if (truncate > 0) {
-            " " * (colWidths(i) - Utils.stringHalfWidth(cell)) + cell
+            StringUtils.leftPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
          } else {
-            cell + " " * (colWidths(i) - Utils.stringHalfWidth(cell))
+            StringUtils.rightPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
          }
        }
      }
@@ -344,8 +344,10 @@ class Dataset[T] private[sql](
          s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
        sb.append(rowHeader).append("\n")
        row.zipWithIndex.map { case (cell, j) =>
-          val fieldName = fieldNames(j) + " " * (fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)))
-          val data = cell + " " * (dataColWidth - Utils.stringHalfWidth(cell))
+          val fieldName = StringUtils.rightPad(fieldNames(j),
+            fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)) + fieldNames(j).length)
+          val data = StringUtils.rightPad(cell,
+            dataColWidth - Utils.stringHalfWidth(cell) + cell.length)
          s" $fieldName | $data "
        }.addString(sb, "", "\n", "\n")
      }

From 363de6ba0f769d31ba179af6a1600bb29f5ba8ef Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Thu, 30 Aug 2018 12:09:16 +0800
Subject: [PATCH 07/10] Add tests in DatasetSuite

---
 .../org/apache/spark/sql/DatasetSuite.scala | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index cf24eba12801..f58a41f25e53 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -969,6 +969,53 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     checkShowString(ds, expected)
   }
 
+  test("SPARK-25108 Fix the show method to display the full width character alignment problem") {
+    val df = Seq(
+      (0, null, 1),
+      (0, "", 1),
+      (0, "ab c", 1),
+      (0, "1098", 1),
+      (0, "mø", 1),
+      (0, "γύρ", 1),
+      (0, "pê", 1),
+      (0, "ー", 1),
+      (0, "测", 1),
+      (0, "か", 1),
+      (0, "걸", 1),
+      (0, "à", 1),
+      (0, "焼", 1),
+      (0, "羍む", 1),
+      (0, "뺭ᾘ", 1),
+      (0, "\u0967\u0968\u0969", 1)
+    ).toDF("b", "a", "c")
+
+    val ds = df.as[ClassData]
+    val expected =
+      """+---+----+---+
+        ||  b|   a|  c|
+        |+---+----+---+
+        ||  0|null|  1|
+        ||  0|    |  1|
+        ||  0|ab c|  1|
+        ||  0|1098|  1|
+        ||  0|  mø|  1|
+        ||  0| γύρ|  1|
+        ||  0|  pê|  1|
+        ||  0|  ー|  1|
+        ||  0|  测|  1|
+        ||  0|  か|  1|
+        ||  0|  걸|  1|
+        ||  0|   à|  1|
+        ||  0|  焼|  1|
+        ||  0|羍む|  1|
+        ||  0| 뺭ᾘ|  1|
+        ||  0| १२३|  1|
+        |+---+----+---+
+        |""".stripMargin
+
+    checkShowString(df, expected)
+  }
+
   test(
     "SPARK-15112: EmbedDeserializerInFilter should not optimize plan fragment that changes schema"
   )

From 3649de50235cd19cfea2c3d88d1ccfb18ea8893a Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Thu, 30 Aug 2018 12:12:47 +0800
Subject: [PATCH 08/10] Modified variable name in DatasetSuite

---
 sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index f58a41f25e53..a18f48f0bffb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1013,7 +1013,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
         |+---+----+---+
         |""".stripMargin
 
-    checkShowString(df, expected)
+    checkShowString(ds, expected)
   }
 
   test(

From 45ac272ca667f3330f7b550b463b23c284d9eadf Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Fri, 31 Aug 2018 10:52:33 +0800
Subject: [PATCH 09/10] Adding scalastyle:off nonascii in code

Fix style errors caused by the non-ASCII characters in these files by
disabling the nonascii check for the sections of code where it is
appropriate to use unicode characters.
---
 core/src/main/scala/org/apache/spark/util/Utils.scala      | 2 ++
 core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 2 ++
 .../src/test/scala/org/apache/spark/sql/DatasetSuite.scala | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 89fd746b4243..d2d26a0b61bc 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2799,6 +2799,7 @@ private[spark] object Utils extends Logging {
    * Regular expression matching full width characters
    */
   private val fullWidthRegex = ("""[""" +
+    // scalastyle:off nonascii
     """\u1100-\u115F""" +
     """\u2E80-\uA4CF""" +
     """\uAC00-\uD7A3""" +
@@ -2807,6 +2808,7 @@ private[spark] object Utils extends Logging {
     """\uFE30-\uFE6F""" +
     """\uFF00-\uFF60""" +
     """\uFFE0-\uFFE6""" +
+    // scalastyle:on nonascii
     """]""").r
 
   /**

diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index ac907c0dc1b1..943b53522d64 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -1186,6 +1186,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
   }
 
   test("stringHalfWidth") {
+    // scalastyle:off nonascii
     assert(Utils.stringHalfWidth(null) == 0)
     assert(Utils.stringHalfWidth("") == 0)
     assert(Utils.stringHalfWidth("ab c") == 4)
@@ -1202,6 +1203,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     assert(Utils.stringHalfWidth("羍む") == 4)
     assert(Utils.stringHalfWidth("뺭ᾘ") == 3)
     assert(Utils.stringHalfWidth("\u0967\u0968\u0969") == 3)
+    // scalastyle:on nonascii
   }
 }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index a18f48f0bffb..ca8fbc991a3a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -970,6 +970,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   }
 
   test("SPARK-25108 Fix the show method to display the full width character alignment problem") {
+    // scalastyle:off nonascii
     val df = Seq(
       (0, null, 1),
       (0, "", 1),
@@ -988,9 +989,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       (0, "뺭ᾘ", 1),
       (0, "\u0967\u0968\u0969", 1)
     ).toDF("b", "a", "c")
-
+    // scalastyle:on nonascii
     val ds = df.as[ClassData]
     val expected =
+      // scalastyle:off nonascii
       """+---+----+---+
         ||  b|   a|  c|
         |+---+----+---+
         ||  0|null|  1|
         ||  0|    |  1|
         ||  0|ab c|  1|
         ||  0|1098|  1|
         ||  0|  mø|  1|
         ||  0| γύρ|  1|
         ||  0|  pê|  1|
         ||  0|  ー|  1|
         ||  0|  测|  1|
         ||  0|  か|  1|
         ||  0|  걸|  1|
         ||  0|   à|  1|
         ||  0|  焼|  1|
         ||  0|羍む|  1|
         ||  0| 뺭ᾘ|  1|
         ||  0| १२३|  1|
         |+---+----+---+
         |""".stripMargin
-
+    // scalastyle:on nonascii
     checkShowString(ds, expected)
   }

From 52acfd58a82f20656331b43a2b27944fdfac2b3f Mon Sep 17 00:00:00 2001
From: xuejianbest <384329882@qq.com>
Date: Tue, 4 Sep 2018 20:18:47 +0800
Subject: [PATCH 10/10] Fix the show method to display the full width
 character alignment problem

Some characters (called full-width characters) occupy the width of two
ordinary characters (called half-width characters) when displayed in a
terminal such as Xshell. When the Dataset.show() method is called on data
that mixes full-width and half-width characters, the columns therefore
fail to line up.

This commit adds a method that calculates the number of half widths in a
given string, so that the show method can compute the correct padding and
align its output properly.

Performance impact: tested a Dataset of 100 rows, each row with two
columns -- one holding the index (0-99) and the other a random string of
100 characters -- and called showString on it. The original show method
(without this patch) took about 42ms; with this patch it took about 46ms,
roughly 10% slower.
---
 core/src/main/scala/org/apache/spark/util/Utils.scala | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index d2d26a0b61bc..b5268f83309f 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2796,7 +2796,10 @@ private[spark] object Utils extends Logging {
   }
 
   /**
-   * Regular expression matching full width characters
+   * Regular expression matching full width characters.
+   *
+   * Looked at all the 0x0000-0xFFFF characters (unicode) and showed them under Xshell.
+   * Found all the full width characters, then derived the regular expression.
    */
   private val fullWidthRegex = ("""[""" +
     // scalastyle:off nonascii
@@ -2814,6 +2817,9 @@ private[spark] object Utils extends Logging {
   /**
    * Return the number of half widths in a given string. Note that a full width character
    * occupies two half widths.
+   *
+   * For a string consisting of 1 million characters, the execution of this method requires
+   * about 50ms.
    */
   def stringHalfWidth(str: String): Int = {
     if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size