1 change: 1 addition & 0 deletions dev/deps/spark-deps-hadoop-2.2
@@ -72,6 +72,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
1 change: 1 addition & 0 deletions dev/deps/spark-deps-hadoop-2.3
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
1 change: 1 addition & 0 deletions dev/deps/spark-deps-hadoop-2.4
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
1 change: 1 addition & 0 deletions dev/deps/spark-deps-hadoop-2.6
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
 htrace-core-3.0.4.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
1 change: 1 addition & 0 deletions dev/deps/spark-deps-hadoop-2.7
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
 htrace-core-3.1.0-incubating.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
1 change: 1 addition & 0 deletions pom.xml
@@ -182,6 +182,7 @@
     <paranamer.version>2.8</paranamer.version>
     <maven-antrun.version>1.8</maven-antrun.version>
     <commons-crypto.version>1.0.0</commons-crypto.version>
+    <ibm.icu.version>58.1</ibm.icu.version>
 
     <test.java.home>${java.home}</test.java.home>
     <test.exclude.tags></test.exclude.tags>
5 changes: 5 additions & 0 deletions sql/core/pom.xml
@@ -91,6 +91,11 @@
       <artifactId>jackson-databind</artifactId>
       <version>${fasterxml.jackson.version}</version>
     </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>${ibm.icu.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.scalacheck</groupId>
       <artifactId>scalacheck_${scala.binary.version}</artifactId>
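For readers who want to try the width logic below outside Spark's Maven build: the property above resolves this dependency to the coordinates com.ibm.icu:icu4j:58.1. An equivalent sbt line (illustrative only, not part of this change):

    // Illustrative sbt coordinate only; Spark itself builds with Maven.
    libraryDependencies += "com.ibm.icu" % "icu4j" % "58.1"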
54 changes: 44 additions & 10 deletions sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -18,13 +18,15 @@
 package org.apache.spark.sql
 
 import java.io.CharArrayWriter
+import java.util.Locale
 
 import scala.collection.JavaConverters._
 import scala.language.implicitConversions
 import scala.reflect.runtime.universe.TypeTag
 import scala.util.control.NonFatal
 
-import org.apache.commons.lang3.StringUtils
+import com.ibm.icu.lang.UCharacter
+import com.ibm.icu.lang.UProperty
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability}
 import org.apache.spark.api.java.JavaRDD
@@ -236,6 +238,29 @@ class Dataset[T] private[sql](
     }
   }
 
+  val EAST_ASIAN_LANGS = Seq("ja", "vi", "kr", "zh")
Review comment (Member): Only these four?

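A side note on the language codes in this list: Locale.getLanguage() returns lowercase ISO 639 codes, and the code for Korean is "ko", not "kr", so the "kr" entry above would never match a Korean locale. A quick JDK-only check (illustrative):

    import java.util.Locale

    object LocaleCheck {
      def main(args: Array[String]): Unit = {
        println(Locale.KOREAN.getLanguage)    // "ko", so "kr" above never matches
        println(Locale.JAPANESE.getLanguage)  // "ja"
        println(new Locale("vi").getLanguage) // "vi"
      }
    }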
+
+  private def unicodeWidth(str: String): Int = {
+    val locale = Locale.getDefault()
+    if (locale == null) {
+      throw new NullPointerException("locale is null")
+    }
+    val ambiguousLen = if (EAST_ASIAN_LANGS.contains(locale.getLanguage())) 2 else 1
Review comment (Member): How about creating a separate helper function for the default width?

Reply (kiszk, Member Author, Dec 2, 2016): I can create a separate helper for the default width. The challenge is how to decide whether the helper can be applied once we have a string. I have been thinking about these conditions but do not have an answer yet.
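For illustration, one shape the suggested helper could take: a minimal sketch assuming the ambiguous-character width depends only on the locale, with all names hypothetical rather than taken from the PR:

    import java.util.Locale

    object WidthDefaults {
      // Hypothetical helper: the column width to assign to
      // EastAsianWidth.AMBIGUOUS characters under a given locale. East Asian
      // locales traditionally render ambiguous-width characters as full width.
      // Note "ko" (the ISO 639 code for Korean) rather than "kr".
      private val EastAsianLangs = Set("ja", "vi", "ko", "zh")

      def defaultAmbiguousWidth(locale: Locale = Locale.getDefault()): Int =
        if (EastAsianLangs.contains(locale.getLanguage)) 2 else 1
    }

Taking the locale as a parameter also makes the behaviour testable without mutating the JVM-wide default.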

+    var len = 0
+    for (i <- 0 until str.length) {
+      val codePoint = str.codePointAt(i)
+      val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH)
+      len = len + (value match {
+        case UCharacter.EastAsianWidth.NARROW | UCharacter.EastAsianWidth.NEUTRAL |
+        UCharacter.EastAsianWidth.HALFWIDTH => 1
Review comment (Member): An indent issue.

+        case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
+        case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen
+        case _ => 1
+      })
+    }
+    len
+  }
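One more reviewer-style observation on the loop above: it advances i one char at a time while calling str.codePointAt(i), so a supplementary character encoded as a surrogate pair (many emoji, rare CJK ideographs) is visited twice, once at the high surrogate where the full code point is returned, and once more at the low surrogate. A code-point-aware variant might look like the following sketch (hypothetical, same ICU property lookup, with ambiguousLen fixed to 1 for brevity):

    import com.ibm.icu.lang.{UCharacter, UProperty}

    object CodePointWidth {
      // Sketch only: walks the string by code point so that surrogate pairs
      // are counted once rather than twice.
      def unicodeWidth(str: String): Int = {
        var len = 0
        var i = 0
        while (i < str.length) {
          val codePoint = str.codePointAt(i)
          val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH)
          len += (value match {
            case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
            case _ => 1
          })
          i += Character.charCount(codePoint) // skip the low surrogate, if any
        }
        len
      }
    }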

   /**
    * Compose the string representing rows for output
    *
@@ -275,36 +300,45 @@
     val numCols = schema.fieldNames.length
 
     // Initialise the width of each column to a minimum value of '3'
-    val colWidths = Array.fill(numCols)(3)
+    val colMaxWidths = Array.fill(numCols)(3)
+    val colWidths = Array.ofDim[Int](rows.length, numCols)
 
     // Compute the width of each column
+    var j = 0
     for (row <- rows) {
       for ((cell, i) <- row.zipWithIndex) {
-        colWidths(i) = math.max(colWidths(i), cell.length)
+        val width = unicodeWidth(cell)
+        colWidths(j)(i) = width
+        colMaxWidths(i) = math.max(colMaxWidths(i), width)
       }
+      j = j + 1
     }
 
     // Create SeparateLine
-    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+    val sep: String = colMaxWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
 
     // column names
     rows.head.zipWithIndex.map { case (cell, i) =>
+      val paddingLen = colMaxWidths(i) - colWidths(0)(i)
       if (truncate > 0) {
-        StringUtils.leftPad(cell, colWidths(i))
+        new StringBuilder(cell.length, " " * paddingLen).append(cell)
       } else {
-        StringUtils.rightPad(cell, colWidths(i))
+        new StringBuilder(paddingLen, cell).append(" " * paddingLen)
       }
     }.addString(sb, "|", "|", "|\n")
 
     sb.append(sep)
 
     // data
-    rows.tail.map {
-      _.zipWithIndex.map { case (cell, i) =>
+    j = 0
+    rows.tail.map { row =>
+      j = j + 1
+      row.zipWithIndex.map { case (cell, i) =>
+        val paddingLen = colMaxWidths(i) - colWidths(j)(i)
         if (truncate > 0) {
-          StringUtils.leftPad(cell.toString, colWidths(i))
+          new StringBuilder(cell.length, " " * paddingLen).append(cell)
         } else {
-          StringUtils.rightPad(cell.toString, colWidths(i))
+          new StringBuilder(paddingLen, cell).append(" " * paddingLen)
        }
       }.addString(sb, "|", "|", "|\n")
     }
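To make the motivation for this hunk concrete: code-unit count and display width diverge for East Asian text, which is why the new code records a per-cell unicodeWidth instead of padding by cell.length. A standalone check, assuming only icu4j 58.1 on the classpath (as added to the POMs above):

    import com.ibm.icu.lang.{UCharacter, UProperty}

    object WidthVsLength {
      // Same EAST_ASIAN_WIDTH lookup as the diff, without the ambiguous case.
      def width(s: String): Int =
        s.codePoints().toArray.map { cp =>
          UCharacter.getIntPropertyValue(cp, UProperty.EAST_ASIAN_WIDTH) match {
            case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
            case _ => 1
          }
        }.sum

      def main(args: Array[String]): Unit = {
        println("文字列1".length) // 4 code units: what the old code padded with
        println(width("文字列1")) // 7 columns: three wide characters plus one narrow
      }
    }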
38 changes: 38 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1060,6 +1060,41 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     }
     assert(e.getMessage.contains("Cannot create encoder for Option of Product type"))
   }
+
+  private def checkString[T](actual: String, expected: String): Unit = {
+    if (expected != actual) {
+      fail(
+        "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+          "== Expected ==\n" + expected,
+          "== Actual ==\n" + actual
+        ).mkString("\n")
+      )
+    }
+  }
+
+  test("SPARK-18653: Dataset.show() should generate correct padding for Unicode Character") {
+    // scalastyle:off
+    val ds = Seq(UnicodeCaseClass(1, 1.1, "文字列1"), UnicodeCaseClass(-2, -2.2, "文字列")).toDS
+    val leftPadding = ds.showString(1, 99)
+    val rightPadding = ds.showString(1, -99)
+    checkString(leftPadding,
+      """+----+----+-------+
+        ||整数|実数|      s|
+        |+----+----+-------+
+        ||   1| 1.1|文字列1|
+        ||  -2|-2.2| 文字列|
+        |+----+----+-------+
+        |""".stripMargin)
+    checkString(rightPadding,
+      """+----+----+-------+
+        ||整数|実数|s      |
+        |+----+----+-------+
+        ||1   |1.1 |文字列1|
+        ||-2  |-2.2|文字列 |
+        |+----+----+-------+
+        |""".stripMargin)
+    // scalastyle:on
+  }
 }
 
 case class Generic[T](id: T, value: Double)
@@ -1135,3 +1170,6 @@ object DatasetTransform {
 
 case class Route(src: String, dest: String, cost: Int)
 case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])
+// scalastyle:off
+case class UnicodeCaseClass(整数: Int, 実数: Double, s: String)
+// scalastyle:on
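As a reading aid for the two expectations in the test above: the sign of the truncate argument selects the alignment, which the diff implements by padding on opposite sides of the cell. Distilled into a standalone sketch (a paraphrase of the logic in showString, not code from the PR):

    object PadRule {
      // truncate > 0  -> left-pad, i.e. right-align (the leftPadding expectation)
      // truncate <= 0 -> right-pad, i.e. left-align (the rightPadding expectation)
      def pad(cell: String, paddingLen: Int, truncate: Int): String =
        if (truncate > 0) " " * paddingLen + cell
        else cell + " " * paddingLen
    }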