From 14deeed1d1f82e32d5cc05a27c79abe7d9bb91b8 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Thu, 1 Dec 2016 04:08:28 +0900
Subject: [PATCH 1/5] initial commit

---
 pom.xml                                      |  1 +
 sql/core/pom.xml                             |  5 ++
 .../scala/org/apache/spark/sql/Dataset.scala | 56 ++++++++++++++++---
 .../org/apache/spark/sql/DatasetSuite.scala  | 36 ++++++++++++
 4 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/pom.xml b/pom.xml
index c391102d3750..6f2b1cebf7f7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -182,6 +182,7 @@
     <paranamer.version>2.8</paranamer.version>
     <maven-antrun.version>1.8</maven-antrun.version>
     <commons-crypto.version>1.0.0</commons-crypto.version>
+    <ibm.icu.version>58.1</ibm.icu.version>
 
     <test.java.home>${java.home}</test.java.home>
     <test.exclude.tags></test.exclude.tags>
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 7da77158ff07..77b086824e6e 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -91,6 +91,11 @@
       <artifactId>jackson-databind</artifactId>
       <version>${fasterxml.jackson.version}</version>
     </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>${ibm.icu.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.scalacheck</groupId>
       <artifactId>scalacheck_${scala.binary.version}</artifactId>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index fcc02e5eb3ef..1fcbc1410cb2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -18,13 +18,15 @@
 package org.apache.spark.sql
 
 import java.io.CharArrayWriter
+import java.util.Locale
 
 import scala.collection.JavaConverters._
 import scala.language.implicitConversions
 import scala.reflect.runtime.universe.TypeTag
 import scala.util.control.NonFatal
 
-import org.apache.commons.lang3.StringUtils
+import com.ibm.icu.lang.UCharacter
+import com.ibm.icu.lang.UProperty
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability}
 import org.apache.spark.api.java.JavaRDD
@@ -236,6 +238,37 @@ class Dataset[T] private[sql](
     }
   }
 
+  var EAST_ASIAN_LANGS = Seq("ja", "vi", "ko", "zh")
+
+  private def unicodeWidth(str: String): Int = {
+    val locale = Locale.getDefault()
+    if (locale == null) {
+      throw new NullPointerException("locale is null")
+    }
+    val ambiguousLen = if (EAST_ASIAN_LANGS.contains(locale.getLanguage())) 2 else 1
+    var len = 0
+    for (i <- 0 until str.length) {
+      val codePoint = str.codePointAt(i)
+      val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH);
+      len = len + (value match {
+        case UCharacter.EastAsianWidth.NARROW | UCharacter.EastAsianWidth.NEUTRAL |
+          UCharacter.EastAsianWidth.HALFWIDTH => 1
+        case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
+        case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen
+        case _ => 1
+      })
+    }
+    len
+  }
+
+  private def repeatPadding(len: Int): StringBuilder = {
+    var str = new StringBuilder(len)
+    for (i <- 0 until len) {
+      str.append(' ')
+    }
+    str
+  }
+
   /**
    * Compose the string representing rows for output
    *
@@ -275,36 +308,43 @@ class Dataset[T] private[sql](
     val numCols = schema.fieldNames.length
 
     // Initialise the width of each column to a minimum value of '3'
-    val colWidths = Array.fill(numCols)(3)
+    val colMaxWidths = Array.fill(numCols)(3)
+    val colWidths = Array.ofDim[Int](rows.length, numCols)
 
     // Compute the width of each column
+    var j = 0
     for (row <- rows) {
       for ((cell, i) <- row.zipWithIndex) {
-        colWidths(i) = math.max(colWidths(i), cell.length)
+        val width = unicodeWidth(cell)
+        colWidths(j)(i) = width
+        colMaxWidths(i) = math.max(colMaxWidths(i), width)
       }
+      j = j + 1
     }
 
     // Create SeparateLine
-    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+    val sep: String = colMaxWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
 
     // column names
     rows.head.zipWithIndex.map { case (cell, i) =>
       if (truncate > 0) {
-        StringUtils.leftPad(cell, colWidths(i))
+        repeatPadding(colMaxWidths(i) - colWidths(0)(i)).append(cell)
       } else {
-        StringUtils.rightPad(cell, colWidths(i))
+        new StringBuffer(cell).append(repeatPadding(colMaxWidths(i) - colWidths(0)(i)))
       }
     }.addString(sb, "|", "|", "|\n")
 
     sb.append(sep)
 
     // data
+    j = 0
     rows.tail.map {
+      j = j + 1
       _.zipWithIndex.map { case (cell, i) =>
         if (truncate > 0) {
-          StringUtils.leftPad(cell.toString, colWidths(i))
+          repeatPadding(colMaxWidths(i) - colWidths(j)(i)).append(cell)
         } else {
-          StringUtils.rightPad(cell.toString, colWidths(i))
+          new StringBuffer(cell).append(repeatPadding(colMaxWidths(i) - colWidths(j)(i)))
         }
       }.addString(sb, "|", "|", "|\n")
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 1174d7354f93..173aa31ce289 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1060,6 +1060,39 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     }
     assert(e.getMessage.contains("Cannot create encoder for Option of Product type"))
   }
+
+  private def checkString[T](actual: String, expected: String): Unit = {
+    if (expected != actual) {
+      fail(
+        "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+          "== Expected ==\n" + expected,
+          "== Actual ==\n" + actual
+        ).mkString("\n")
+      )
+    }
+  }
+
+  test("SPARK-18653: Dataset.show() should generate correct padding for Unicode Character") {
+    // scalastyle:off
+    val ds = Seq(UnicodeCaseClass(1, 1.1, "文字列1")).toDS
+    val leftPadding = ds.showString(1, 99)
+    val rightPadding = ds.showString(1, -99)
+    checkString(leftPadding,
+      """+----+----+-------+
+        ||整数|実数|      s|
+        |+----+----+-------+
+        ||   1| 1.1|文字列1|
+        |+----+----+-------+
+        |""".stripMargin)
+    checkString(rightPadding,
+      """+----+----+-------+
+        ||整数|実数|s      |
+        |+----+----+-------+
+        ||1   |1.1 |文字列1|
+        |+----+----+-------+
+        |""".stripMargin)
+    // scalastyle:on
+  }
 }
 
 case class Generic[T](id: T, value: Double)
@@ -1135,3 +1168,6 @@
 
 case class Route(src: String, dest: String, cost: Int)
 case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])
+// scalastyle:off
+case class UnicodeCaseClass(整数: Int, 実数: Double, s: String)
+// scalastyle:on
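For context on what patch 1 does: showString now measures each cell by its display width under the Unicode East Asian Width property (UAX #11) rather than by its character count, so CJK cells line up in terminal output. Below is a minimal standalone sketch of the same computation (object and method names are illustrative, not part of the patch; unlike the patch's per-char loop, it advances one code point at a time so supplementary characters outside the BMP are counted once):

    import com.ibm.icu.lang.{UCharacter, UProperty}

    object EastAsianWidthSketch {
      // Display columns taken by one code point: WIDE/FULLWIDTH take 2,
      // AMBIGUOUS takes 2 only under an East Asian default locale.
      def codePointWidth(cp: Int, ambiguousLen: Int): Int =
        UCharacter.getIntPropertyValue(cp, UProperty.EAST_ASIAN_WIDTH) match {
          case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
          case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen
          case _ => 1 // NARROW, NEUTRAL, HALFWIDTH
        }

      def stringWidth(str: String, ambiguousLen: Int = 1): Int = {
        var len = 0
        var i = 0
        while (i < str.length) {
          val cp = str.codePointAt(i)
          len += codePointWidth(cp, ambiguousLen)
          i += Character.charCount(cp) // step over a full code point, surrogate pairs included
        }
        len
      }

      def main(args: Array[String]): Unit = {
        println(stringWidth("abc"))     // 3: three narrow characters
        println(stringWidth("文字列1")) // 7 = 2 + 2 + 2 + 1
      }
    }

This is why the test in patch 1 expects the third column to be exactly seven '-' wide: "文字列1" occupies seven display columns, not four characters' worth.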
From a42547d91655a493e96420fd4d989827d4be4529 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Thu, 1 Dec 2016 04:24:49 +0900
Subject: [PATCH 2/5] add jar for icu

---
 dev/deps/spark-deps-hadoop-2.2 | 1 +
 dev/deps/spark-deps-hadoop-2.3 | 1 +
 dev/deps/spark-deps-hadoop-2.4 | 1 +
 dev/deps/spark-deps-hadoop-2.6 | 1 +
 dev/deps/spark-deps-hadoop-2.7 | 1 +
 5 files changed, 5 insertions(+)

diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 89bfcef4d946..90a3f91047b1 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -72,6 +72,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 8df3858825e1..52bf28a2f36d 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 71e7fb6dd243..6da8c491fd72 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index ba31391495f5..7039bdeb1b73 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
 htrace-core-3.0.4.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index b129e5a99e2f..afe4b2a5db88 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
 htrace-core-3.1.0-incubating.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar

From 70498098303af9535f7ea99dcd33294d6ba4b3f8 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Thu, 1 Dec 2016 10:44:07 +0900
Subject: [PATCH 3/5] fix test failure

---
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 1fcbc1410cb2..ab518fcb3563 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -338,9 +338,9 @@ class Dataset[T] private[sql](
 
     // data
     j = 0
-    rows.tail.map {
+    rows.tail.map { row =>
       j = j + 1
-      _.zipWithIndex.map { case (cell, i) =>
+      row.zipWithIndex.map { case (cell, i) =>
         if (truncate > 0) {
           repeatPadding(colMaxWidths(i) - colWidths(j)(i)).append(cell)
         } else {
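A note on patch 3 above: in the patch-1 form, rows.tail.map { j = j + 1; _.zipWithIndex.map ... }, the placeholder lambda is the value of the enclosing block, so j = j + 1 runs once when map's argument is evaluated and every data row is padded with the widths recorded for the first data row. Naming the parameter (row =>) moves the increment into the function body. A toy reduction of the pitfall (illustrative code, not from the patch):

    object PlaceholderPitfall {
      def main(args: Array[String]): Unit = {
        var j = 0
        // The block evaluates `j = j + 1` once, then yields the lambda `_ + j`,
        // so every element sees j == 1.
        val once = Seq("a", "b", "c").map { j = j + 1; _ + j }
        println(once) // List(a1, b1, c1)

        var k = 0
        // With a named parameter, the increment sits inside the lambda body.
        val perElement = Seq("a", "b", "c").map { s => k = k + 1; s + k }
        println(perElement) // List(a1, b2, c3)
      }
    }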
From 8e1b434119d0d4451f9be5f6109d44ff3b3c30d6 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Fri, 2 Dec 2016 01:22:21 +0900
Subject: [PATCH 4/5] addressed review comments

---
 .../scala/org/apache/spark/sql/Dataset.scala | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index ab518fcb3563..5eb8d5d35d98 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -238,7 +238,7 @@ class Dataset[T] private[sql](
     }
   }
 
-  var EAST_ASIAN_LANGS = Seq("ja", "vi", "ko", "zh")
+  val EAST_ASIAN_LANGS = Seq("ja", "vi", "ko", "zh")
 
   private def unicodeWidth(str: String): Int = {
     val locale = Locale.getDefault()
@@ -249,10 +249,10 @@ class Dataset[T] private[sql](
     var len = 0
     for (i <- 0 until str.length) {
       val codePoint = str.codePointAt(i)
-      val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH);
+      val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH)
       len = len + (value match {
         case UCharacter.EastAsianWidth.NARROW | UCharacter.EastAsianWidth.NEUTRAL |
-          UCharacter.EastAsianWidth.HALFWIDTH => 1
+             UCharacter.EastAsianWidth.HALFWIDTH => 1
         case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
         case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen
         case _ => 1
@@ -261,14 +261,6 @@ class Dataset[T] private[sql](
     len
   }
 
-  private def repeatPadding(len: Int): StringBuilder = {
-    var str = new StringBuilder(len)
-    for (i <- 0 until len) {
-      str.append(' ')
-    }
-    str
-  }
-
   /**
    * Compose the string representing rows for output
    *
@@ -327,10 +319,11 @@ class Dataset[T] private[sql](
 
     // column names
     rows.head.zipWithIndex.map { case (cell, i) =>
+      val paddingLen = colMaxWidths(i) - colWidths(0)(i)
       if (truncate > 0) {
-        repeatPadding(colMaxWidths(i) - colWidths(0)(i)).append(cell)
+        new StringBuilder(cell.length, " " * paddingLen).append(cell)
       } else {
-        new StringBuffer(cell).append(repeatPadding(colMaxWidths(i) - colWidths(0)(i)))
+        new StringBuilder(paddingLen, cell).append(" " * paddingLen)
      }
     }.addString(sb, "|", "|", "|\n")
 
@@ -341,10 +334,11 @@ class Dataset[T] private[sql](
     rows.tail.map { row =>
       j = j + 1
       row.zipWithIndex.map { case (cell, i) =>
+        val paddingLen = colMaxWidths(i) - colWidths(j)(i)
         if (truncate > 0) {
-          repeatPadding(colMaxWidths(i) - colWidths(j)(i)).append(cell)
+          new StringBuilder(cell.length, " " * paddingLen).append(cell)
         } else {
-          new StringBuffer(cell).append(repeatPadding(colMaxWidths(i) - colWidths(j)(i)))
+          new StringBuilder(paddingLen, cell).append(" " * paddingLen)
         }
       }.addString(sb, "|", "|", "|\n")
     }

From e828ad9f5aa64663f747370dbf7c28a335804bc6 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Fri, 2 Dec 2016 01:22:52 +0900
Subject: [PATCH 5/5] add one more line for the testsuite

---
 .../src/test/scala/org/apache/spark/sql/DatasetSuite.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 173aa31ce289..7a6f535964e4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1074,7 +1074,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
 
   test("SPARK-18653: Dataset.show() should generate correct padding for Unicode Character") {
     // scalastyle:off
-    val ds = Seq(UnicodeCaseClass(1, 1.1, "文字列1")).toDS
+    val ds = Seq(UnicodeCaseClass(1, 1.1, "文字列1"), UnicodeCaseClass(-2, -2.2, "文字列")).toDS
     val leftPadding = ds.showString(1, 99)
     val rightPadding = ds.showString(1, -99)
     checkString(leftPadding,
@@ -1082,6 +1082,7 @@ """+----+----+-------+
         ||整数|実数|      s|
         |+----+----+-------+
         ||   1| 1.1|文字列1|
+        ||  -2|-2.2| 文字列|
         |+----+----+-------+
         |""".stripMargin)
     checkString(rightPadding,
@@ -1089,6 +1090,7 @@ """+----+----+-------+
         ||整数|実数|s      |
         |+----+----+-------+
         ||1   |1.1 |文字列1|
+        ||-2  |-2.2|文字列 |
         |+----+----+-------+
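With patch 4 applied, padding is no longer built character by character: each cell is left- or right-justified by prepending or appending " " * paddingLen, where paddingLen is the column's maximum display width minus the cell's own display width. A condensed sketch of the resulting alignment logic (helper name and signature are illustrative, not the patch's code):

    // Justify `cell` within a column `colWidth` display columns wide, given the
    // cell's own display width `cellWidth` (computed as in the sketch after patch 1).
    // truncate > 0 in showString corresponds to right-justified output.
    def alignCell(cell: String, cellWidth: Int, colWidth: Int, rightJustify: Boolean): String = {
      val pad = " " * (colWidth - cellWidth)
      if (rightJustify) pad + cell else cell + pad
    }

    // alignCell("1", 1, 4, rightJustify = true)        == "   1"
    // alignCell("文字列", 6, 7, rightJustify = false)  == "文字列 "  (6 columns + 1 space)

This matches the rows patch 5 adds to the expected output: "-2" becomes "  -2" in the four-column-wide 整数 column, and "文字列" becomes " 文字列" (right-justified) or "文字列 " (left-justified) in the seven-column third column.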