-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-18653][SQL] Fix incorrect space padding for unicode character at Dataset.show #16086
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,13 +18,15 @@ | |
| package org.apache.spark.sql | ||
|
|
||
| import java.io.CharArrayWriter | ||
| import java.util.Locale | ||
|
|
||
| import scala.collection.JavaConverters._ | ||
| import scala.language.implicitConversions | ||
| import scala.reflect.runtime.universe.TypeTag | ||
| import scala.util.control.NonFatal | ||
|
|
||
| import org.apache.commons.lang3.StringUtils | ||
| import com.ibm.icu.lang.UCharacter | ||
| import com.ibm.icu.lang.UProperty | ||
|
|
||
| import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability} | ||
| import org.apache.spark.api.java.JavaRDD | ||
|
|
@@ -236,6 +238,29 @@ class Dataset[T] private[sql]( | |
| } | ||
| } | ||
|
|
||
| val EAST_ASIAN_LANGS = Seq("ja", "vi", "kr", "zh") | ||
|
|
||
| private def unicodeWidth(str: String): Int = { | ||
| val locale = Locale.getDefault() | ||
| if (locale == null) { | ||
| throw new NullPointerException("locale is null") | ||
| } | ||
| val ambiguousLen = if (EAST_ASIAN_LANGS.contains(locale.getLanguage())) 2 else 1 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about creating a separate helper function for the default width?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can create the separate helper for the default width. A challenge is how we can decide the helper can be applied when we have got a string. |
||
| var len = 0 | ||
| for (i <- 0 until str.length) { | ||
| val codePoint = str.codePointAt(i) | ||
| val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH) | ||
| len = len + (value match { | ||
| case UCharacter.EastAsianWidth.NARROW | UCharacter.EastAsianWidth.NEUTRAL | | ||
| UCharacter.EastAsianWidth.HALFWIDTH => 1 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An indent issue. |
||
| case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2 | ||
| case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen | ||
| case _ => 1 | ||
| }) | ||
| } | ||
| len | ||
| } | ||
|
|
||
| /** | ||
| * Compose the string representing rows for output | ||
| * | ||
|
|
@@ -275,36 +300,45 @@ class Dataset[T] private[sql]( | |
| val numCols = schema.fieldNames.length | ||
|
|
||
| // Initialise the width of each column to a minimum value of '3' | ||
| val colWidths = Array.fill(numCols)(3) | ||
| val colMaxWidths = Array.fill(numCols)(3) | ||
| val colWidths = Array.ofDim[Int](rows.length, numCols) | ||
|
|
||
| // Compute the width of each column | ||
| var j = 0 | ||
| for (row <- rows) { | ||
| for ((cell, i) <- row.zipWithIndex) { | ||
| colWidths(i) = math.max(colWidths(i), cell.length) | ||
| val width = unicodeWidth(cell) | ||
| colWidths(j)(i) = width | ||
| colMaxWidths(i) = math.max(colMaxWidths(i), width) | ||
| } | ||
| j = j + 1 | ||
| } | ||
|
|
||
| // Create SeparateLine | ||
| val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString() | ||
| val sep: String = colMaxWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString() | ||
|
|
||
| // column names | ||
| rows.head.zipWithIndex.map { case (cell, i) => | ||
| val paddingLen = colMaxWidths(i) - colWidths(0)(i) | ||
| if (truncate > 0) { | ||
| StringUtils.leftPad(cell, colWidths(i)) | ||
| new StringBuilder(cell.length, " " * paddingLen).append(cell) | ||
| } else { | ||
| StringUtils.rightPad(cell, colWidths(i)) | ||
| new StringBuilder(paddingLen, cell).append(" " * paddingLen) | ||
| } | ||
| }.addString(sb, "|", "|", "|\n") | ||
|
|
||
| sb.append(sep) | ||
|
|
||
| // data | ||
| rows.tail.map { | ||
| _.zipWithIndex.map { case (cell, i) => | ||
| j = 0 | ||
| rows.tail.map { row => | ||
| j = j + 1 | ||
| row.zipWithIndex.map { case (cell, i) => | ||
| val paddingLen = colMaxWidths(i) - colWidths(j)(i) | ||
| if (truncate > 0) { | ||
| StringUtils.leftPad(cell.toString, colWidths(i)) | ||
| new StringBuilder(cell.length, " " * paddingLen).append(cell) | ||
| } else { | ||
| StringUtils.rightPad(cell.toString, colWidths(i)) | ||
| new StringBuilder(paddingLen, cell).append(" " * paddingLen) | ||
| } | ||
| }.addString(sb, "|", "|", "|\n") | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only these four?