From b0b4b9e4f0756f2940d58e934a4e06c176146306 Mon Sep 17 00:00:00 2001
From: Sean Zhong
Date: Tue, 6 Sep 2016 17:26:41 +0800
Subject: [PATCH] OOM

---
 .../org/apache/spark/sql/catalyst/trees/TreeNode.scala  | 4 +++-
 .../src/test/scala/org/apache/spark/sql/QueryTest.scala | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index c97dc2d8be7e6..68d340b34c142 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -530,7 +530,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
     case s: String => JString(s)
     case u: UUID => JString(u.toString)
     case dt: DataType => dt.jsonValue
-    case m: Metadata => m.jsonValue
+    // SPARK-17356: When used by MLlib, Metadata may store a huge vector of data. Transforming
+    // it to JSON may trigger an OutOfMemoryError.
+    case m: Metadata => Metadata.empty.jsonValue
     case s: StorageLevel =>
       ("useDisk" -> s.useDisk) ~ ("useMemory" -> s.useMemory) ~ ("useOffHeap" -> s.useOffHeap) ~
         ("deserialized" -> s.deserialized) ~ ("replication" -> s.replication)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
index ebb6badbb452c..aa0c9b1a80172 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.columnar.InMemoryRelation
 import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.execution.{LogicalRDD, Queryable}
+import org.apache.spark.sql.types.Metadata
 
 abstract class QueryTest extends PlanTest {
 
@@ -224,6 +225,13 @@ abstract class QueryTest extends PlanTest {
     val normalized1 = logicalPlan.transformAllExpressions {
       case udf: ScalaUDF => udf.copy(function = null)
       case gen: UserDefinedGenerator => gen.copy(function = null)
+      // After SPARK-17356, the JSON representation no longer contains the Metadata. We need to
+      // remove the Metadata from the normalized plan so that we can compare this plan with the
+      // JSON-deserialized plan.
+      case a @ Alias(child, name) if a.explicitMetadata.isDefined =>
+        Alias(child, name)(a.exprId, a.qualifiers, Some(Metadata.empty))
+      case a: AttributeReference if a.metadata != Metadata.empty =>
+        AttributeReference(a.name, a.dataType, a.nullable, Metadata.empty)(a.exprId, a.qualifiers)
     }
 
     // RDDs/data are not serializable to JSON, so we need to collect LogicalPlans that contains
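
As a rough, self-contained illustration of the failure mode this patch guards against, the sketch below attaches a Metadata blob containing a large double array (the kind of per-column attribute metadata MLlib can produce) to a column and then serializes the logical plan with TreeNode.toJSON. The object name, column name, metadata key, and array size are illustrative assumptions, not part of the patch; it is only a sketch of the scenario against Spark 2.0-era APIs.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.MetadataBuilder

object MetadataJsonSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("metadata-json-sketch")
      .getOrCreate()
    import spark.implicits._

    // MLlib-style column metadata carrying a large numeric array
    // (the size is an arbitrary example; scale it up to stress the driver heap).
    val hugeArray = Array.fill(10000000)(1.0)
    val bigMetadata = new MetadataBuilder().putDoubleArray("values", hugeArray).build()

    // Attach the metadata to a column through an explicit alias.
    val df = Seq(1.0, 2.0, 3.0).toDF("v").select(col("v").as("v", bigMetadata))

    // TreeNode.toJSON walks the plan and renders each field as JSON.
    val planJson = df.queryExecution.logical.toJSON
    println(s"plan JSON length: ${planJson.length}")

    spark.stop()
  }
}

Before this change the entire array is rendered into the JSON string, which is what can exhaust the driver heap; with the change, the Metadata field is serialized as Metadata.empty, so the plan JSON stays small regardless of the array size.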