Skip to content

Commit ef739d9

Browse files
committed
[SQL] Add toString to DataFrame/Column
Author: Michael Armbrust <[email protected]> Closes #4436 from marmbrus/dfToString and squashes the following commits: 8a3c35f [Michael Armbrust] Merge remote-tracking branch 'origin/master' into dfToString b72a81b [Michael Armbrust] add toString (cherry picked from commit de80b1b) Signed-off-by: Michael Armbrust <[email protected]>
1 parent c294216 commit ef739d9

File tree

8 files changed

+86
-8
lines changed

8 files changed

+86
-8
lines changed

python/pyspark/sql/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ def selectExpr(self, *expr):
447447
`select` that accepts SQL expressions.
448448
449449
>>> df.selectExpr("age * 2", "abs(age)").collect()
450-
[Row(('age * 2)=4, Abs('age)=2), Row(('age * 2)=10, Abs('age)=5)]
450+
[Row((age * 2)=4, Abs(age)=2), Row((age * 2)=10, Abs(age)=5)]
451451
"""
452452
jexpr = ListConverter().convert(expr, self._sc._gateway._gateway_client)
453453
jdf = self._jdf.selectExpr(self._sc._jvm.PythonUtils.toSeq(jexpr))

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.sql.catalyst.expressions
1919

20+
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
2021
import org.apache.spark.sql.catalyst.errors.TreeNodeException
2122
import org.apache.spark.sql.catalyst.trees
2223
import org.apache.spark.sql.catalyst.trees.TreeNode
@@ -66,6 +67,17 @@ abstract class Expression extends TreeNode[Expression] {
6667
*/
6768
def childrenResolved = !children.exists(!_.resolved)
6869

70+
/**
71+
* Returns a string representation of this expression that does not have developer centric
72+
* debugging information like the expression id.
73+
*/
74+
def prettyString: String = {
75+
transform {
76+
case a: AttributeReference => PrettyAttribute(a.name)
77+
case u: UnresolvedAttribute => PrettyAttribute(u.name)
78+
}.toString
79+
}
80+
6981
/**
7082
* A set of helper functions that return the correct descendant of `scala.math.Numeric[T]` type
7183
* and do any casting necessary of child evaluation.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,26 @@ case class AttributeReference(
190190
override def toString: String = s"$name#${exprId.id}$typeSuffix"
191191
}
192192

193+
/**
194+
* A place holder used when printing expressions without debugging information such as the
195+
* expression id or the unresolved indicator.
196+
*/
197+
case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[Expression] {
198+
type EvaluatedType = Any
199+
200+
override def toString = name
201+
202+
override def withNullability(newNullability: Boolean): Attribute = ???
203+
override def newInstance(): Attribute = ???
204+
override def withQualifiers(newQualifiers: Seq[String]): Attribute = ???
205+
override def withName(newName: String): Attribute = ???
206+
override def qualifiers: Seq[String] = ???
207+
override def exprId: ExprId = ???
208+
override def eval(input: Row): EvaluatedType = ???
209+
override def nullable: Boolean = ???
210+
override def dataType: DataType = ???
211+
}
212+
193213
object VirtualColumn {
194214
val groupingIdName = "grouping__id"
195215
def newGroupingId = AttributeReference(groupingIdName, IntegerType, false)()

sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ import org.apache.spark.sql.catalyst.plans.logical._
2727
import org.apache.spark.sql.types.StructType
2828
import org.apache.spark.util.Utils
2929

30+
import scala.util.control.NonFatal
31+
3032

3133
private[sql] object DataFrame {
3234
def apply(sqlContext: SQLContext, logicalPlan: LogicalPlan): DataFrame = {
@@ -92,6 +94,12 @@ trait DataFrame extends RDDApi[Row] {
9294
*/
9395
def toDataFrame: DataFrame = this
9496

97+
override def toString =
98+
try schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", ", ", "]") catch {
99+
case NonFatal(e) =>
100+
s"Invalid tree; ${e.getMessage}:\n$queryExecution"
101+
}
102+
95103
/**
96104
* Returns a new [[DataFrame]] with columns renamed. This can be quite convenient in conversion
97105
* from a RDD of tuples into a [[DataFrame]] with meaningful names. For example:

sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -201,13 +201,11 @@ private[sql] class DataFrameImpl protected[sql](
201201
override def as(alias: Symbol): DataFrame = Subquery(alias.name, logicalPlan)
202202

203203
override def select(cols: Column*): DataFrame = {
204-
val exprs = cols.zipWithIndex.map {
205-
case (Column(expr: NamedExpression), _) =>
206-
expr
207-
case (Column(expr: Expression), _) =>
208-
Alias(expr, expr.toString)()
204+
val namedExpressions = cols.map {
205+
case Column(expr: NamedExpression) => expr
206+
case Column(expr: Expression) => Alias(expr, expr.prettyString)()
209207
}
210-
Project(exprs.toSeq, logicalPlan)
208+
Project(namedExpressions.toSeq, logicalPlan)
211209
}
212210

213211
override def select(col: String, cols: String*): DataFrame = {

sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten
4040
throw new UnsupportedOperationException("Cannot run this method on an UncomputableColumn")
4141
}
4242

43+
override def toString = expr.prettyString
44+
4345
override def isComputable: Boolean = false
4446

4547
override val sqlContext: SQLContext = null

sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import scala.collection.mutable.HashSet
2222
import org.apache.spark.{AccumulatorParam, Accumulator, SparkContext}
2323
import org.apache.spark.annotation.DeveloperApi
2424
import org.apache.spark.SparkContext._
25-
import org.apache.spark.sql.{DataFrame, Row}
25+
import org.apache.spark.sql.{SQLConf, SQLContext, DataFrame, Row}
2626
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
2727
import org.apache.spark.sql.types._
2828

@@ -37,6 +37,15 @@ import org.apache.spark.sql.types._
3737
*/
3838
package object debug {
3939

40+
/**
41+
* Augments [[SQLContext]] with debug methods.
42+
*/
43+
implicit class DebugSQLContext(sqlContext: SQLContext) {
44+
def debug() = {
45+
sqlContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "false")
46+
}
47+
}
48+
4049
/**
4150
* :: DeveloperApi ::
4251
* Augments [[DataFrame]]s with debug methods.

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.spark.sql
1919

20+
import org.apache.spark.sql.TestData._
21+
2022
import scala.language.postfixOps
2123

2224
import org.apache.spark.sql.Dsl._
@@ -53,6 +55,33 @@ class DataFrameSuite extends QueryTest {
5355
TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
5456
}
5557

58+
test("dataframe toString") {
59+
assert(testData.toString === "[key: int, value: string]")
60+
assert(testData("key").toString === "[key: int]")
61+
}
62+
63+
test("incomputable toString") {
64+
assert($"test".toString === "test")
65+
}
66+
67+
test("invalid plan toString, debug mode") {
68+
val oldSetting = TestSQLContext.conf.dataFrameEagerAnalysis
69+
TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true")
70+
71+
// Turn on debug mode so we can see invalid query plans.
72+
import org.apache.spark.sql.execution.debug._
73+
TestSQLContext.debug()
74+
75+
val badPlan = testData.select('badColumn)
76+
77+
assert(badPlan.toString contains badPlan.queryExecution.toString,
78+
"toString on bad query plans should include the query execution but was:\n" +
79+
badPlan.toString)
80+
81+
// Set the flag back to original value before this test.
82+
TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString)
83+
}
84+
5685
test("table scan") {
5786
checkAnswer(
5887
testData,

0 commit comments

Comments
 (0)