Commit f34122d

Merge remote-tracking branch 'apache/master' into newCodeGen
Conflicts:
	sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
	sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
	sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
	sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
	sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
2 parents: 67b1c48 + c7db274

22 files changed, +290 -147 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.{errors, trees}
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.BaseRelation
+import org.apache.spark.sql.catalyst.plans.logical.LeafNode
 import org.apache.spark.sql.catalyst.trees.TreeNode
 
 /**
@@ -36,7 +36,7 @@ class UnresolvedException[TreeType <: TreeNode[_]](tree: TreeType, function: Str
 case class UnresolvedRelation(
     databaseName: Option[String],
     tableName: String,
-    alias: Option[String] = None) extends BaseRelation {
+    alias: Option[String] = None) extends LeafNode {
   override def output = Nil
   override lazy val resolved = false
 }

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/BaseRelation.scala

Lines changed: 0 additions & 24 deletions
This file was deleted.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala

Lines changed: 22 additions & 0 deletions
@@ -26,6 +26,25 @@ import org.apache.spark.sql.catalyst.trees
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
   self: Product =>
 
+  /**
+   * Estimates of various statistics. The default estimation logic simply lazily multiplies the
+   * corresponding statistic produced by the children. To override this behavior, override
+   * `statistics` and assign it an overridden version of `Statistics`.
+   *
+   * '''NOTE''': concrete and/or overridden versions of statistics fields should pay attention to the
+   * performance of the implementations. The reason is that estimations might get triggered in
+   * performance-critical processes, such as query plan planning.
+   *
+   * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
+   *                    defaults to the product of children's `sizeInBytes`.
+   */
+  case class Statistics(
+    sizeInBytes: BigInt
+  )
+  lazy val statistics: Statistics = Statistics(
+    sizeInBytes = children.map(_.statistics).map(_.sizeInBytes).product
+  )
+
   /**
    * Returns the set of attributes that are referenced by this node
    * during evaluation.
@@ -92,6 +111,9 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
 abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] {
   self: Product =>
 
+  override lazy val statistics: Statistics =
+    throw new UnsupportedOperationException(s"LeafNode $nodeName must implement statistics.")
+
   // Leaf nodes by definition cannot reference any input attributes.
   override def references = Set.empty
 }
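
The estimation scheme introduced above is bottom-up: every logical operator carries a lazily computed Statistics value, inner nodes default to the product of their children's sizeInBytes, and leaf operators are expected to supply a concrete figure (otherwise the LeafNode override throws). A minimal standalone sketch of that scheme, using simplified hypothetical classes (Plan, ExampleRelation, ExampleJoin) rather than the actual Catalyst types:

// Simplified analogue of the statistics framework added in this commit; the names below are
// illustrative, not the real Catalyst API.
abstract class Plan {
  def children: Seq[Plan]

  case class Statistics(sizeInBytes: BigInt)

  // Default estimate: multiply the children's sizes (a pessimistic upper bound).
  lazy val statistics: Statistics =
    Statistics(sizeInBytes = children.map(_.statistics.sizeInBytes).product)
}

// A hypothetical leaf relation reports the physical size of its backing data.
case class ExampleRelation(bytesOnDisk: BigInt) extends Plan {
  def children: Seq[Plan] = Nil
  override lazy val statistics: Statistics = Statistics(sizeInBytes = bytesOnDisk)
}

// A hypothetical binary operator (think: join) keeps the multiplicative default.
case class ExampleJoin(left: Plan, right: Plan) extends Plan {
  def children: Seq[Plan] = Seq(left, right)
}

object StatisticsDemo extends App {
  val joined = ExampleJoin(ExampleRelation(1024), ExampleRelation(2048))
  println(joined.statistics.sizeInBytes) // 2097152 = 1024 * 2048
}

Multiplying children's sizes deliberately overestimates, so a plan containing an operator without a real estimate never looks small enough to be broadcast by mistake, which is presumably why the LeafNode default throws instead of silently returning 1.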

sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala

Lines changed: 38 additions & 27 deletions
@@ -21,17 +21,34 @@ import java.util.Properties
 
 import scala.collection.JavaConverters._
 
+object SQLConf {
+  val AUTO_BROADCASTJOIN_THRESHOLD = "spark.sql.autoBroadcastJoinThreshold"
+  val DEFAULT_SIZE_IN_BYTES = "spark.sql.defaultSizeInBytes"
+  val AUTO_CONVERT_JOIN_SIZE = "spark.sql.auto.convert.join.size"
+  val SHUFFLE_PARTITIONS = "spark.sql.shuffle.partitions"
+  val JOIN_BROADCAST_TABLES = "spark.sql.join.broadcastTables"
+  val CODEGEN_ENABLED = "spark.sql.codegen"
+
+  object Deprecated {
+    val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
+  }
+}
+
 /**
- * SQLConf holds mutable config parameters and hints. These can be set and
- * queried either by passing SET commands into Spark SQL's DSL
- * functions (sql(), hql(), etc.), or by programmatically using setters and
- * getters of this class.
+ * A trait that enables the setting and getting of mutable config parameters/hints.
+ *
+ * In the presence of a SQLContext, these can be set and queried by passing SET commands
+ * into Spark SQL's query functions (sql(), hql(), etc.). Otherwise, users of this trait can
+ * modify the hints by programmatically calling the setters and getters of this trait.
  *
- * SQLConf is thread-safe (internally synchronized so safe to be used in multiple threads).
+ * SQLConf is thread-safe (internally synchronized, so safe to be used in multiple threads).
  */
 trait SQLConf {
   import SQLConf._
 
+  @transient protected[spark] val settings = java.util.Collections.synchronizedMap(
+    new java.util.HashMap[String, String]())
+
   /** ************************ Spark SQL Params/Hints ******************* */
   // TODO: refactor so that these hints accessors don't pollute the name space of SQLContext?
 
@@ -52,28 +69,33 @@ trait SQLConf {
 
   /**
    * Upper bound on the sizes (in bytes) of the tables qualified for the auto conversion to
-   * a broadcast value during the physical executions of join operations. Setting this to 0
+   * a broadcast value during the physical executions of join operations. Setting this to -1
    * effectively disables auto conversion.
-   * Hive setting: hive.auto.convert.join.noconditionaltask.size.
+   *
+   * Hive setting: hive.auto.convert.join.noconditionaltask.size, whose default value is also 10000.
    */
-  private[spark] def autoConvertJoinSize: Int = get(AUTO_CONVERT_JOIN_SIZE, "10000").toInt
+  private[spark] def autoBroadcastJoinThreshold: Int =
+    get(AUTO_BROADCASTJOIN_THRESHOLD, "10000").toInt
 
-  /** A comma-separated list of table names marked to be broadcasted during joins. */
-  private[spark] def joinBroadcastTables: String = get(JOIN_BROADCAST_TABLES, "")
+  /**
+   * The default size in bytes to assign to a logical operator's estimation statistics. By default,
+   * it is set to a larger value than `autoConvertJoinSize`, hence any logical operator without a
+   * properly implemented estimation of this statistic will not be incorrectly broadcasted in joins.
+   */
+  private[spark] def defaultSizeInBytes: Long =
+    getOption(DEFAULT_SIZE_IN_BYTES).map(_.toLong).getOrElse(autoBroadcastJoinThreshold + 1)
 
   /** ********************** SQLConf functionality methods ************ */
 
-  @transient
-  private val settings = java.util.Collections.synchronizedMap(
-    new java.util.HashMap[String, String]())
-
   def set(props: Properties): Unit = {
-    props.asScala.foreach { case (k, v) => this.settings.put(k, v) }
+    settings.synchronized {
+      props.asScala.foreach { case (k, v) => settings.put(k, v) }
+    }
   }
 
   def set(key: String, value: String): Unit = {
     require(key != null, "key cannot be null")
-    require(value != null, s"value cannot be null for $key")
+    require(value != null, s"value cannot be null for key: $key")
     settings.put(key, value)
   }
 
@@ -100,16 +122,5 @@ trait SQLConf {
   private[spark] def clear() {
     settings.clear()
   }
-
 }
 
-object SQLConf {
-  val AUTO_CONVERT_JOIN_SIZE = "spark.sql.auto.convert.join.size"
-  val SHUFFLE_PARTITIONS = "spark.sql.shuffle.partitions"
-  val JOIN_BROADCAST_TABLES = "spark.sql.join.broadcastTables"
-  val CODEGEN_ENABLED = "spark.sql.codegen"
-
-  object Deprecated {
-    val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
-  }
-}
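
Read together, the new keys let users cap the broadcast-join threshold and control the fallback size assigned to operators that provide no estimate of their own. A rough usage sketch, assuming a Spark 1.x-style SQLContext (which mixes in this SQLConf trait); the master, app name, and threshold value are illustrative only:

// Hypothetical standalone driver showing the two ways the doc comment above describes for
// adjusting these settings: programmatic setters and SQL-level SET commands.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLConf, SQLContext}

object SQLConfExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("SQLConfExample"))
    val sqlContext = new SQLContext(sc)

    // Programmatic setter from the SQLConf trait (key names as defined in object SQLConf).
    sqlContext.set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, "10485760") // ~10 MB

    // Equivalent SET command routed through the SQL query functions.
    sqlContext.sql("SET spark.sql.shuffle.partitions=8")

    // Unless spark.sql.defaultSizeInBytes is set explicitly, defaultSizeInBytes falls back to
    // autoBroadcastJoinThreshold + 1, so unsized operators are never broadcast by accident.
    sc.stop()
  }
}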

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 12 additions & 15 deletions
@@ -24,14 +24,14 @@ import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark.annotation.{AlphaComponent, DeveloperApi, Experimental}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.dsl.ExpressionConversions
-import org.apache.spark.sql.catalyst.types._
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.columnar.InMemoryRelation
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.SparkStrategies
@@ -86,15 +86,15 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group userf
    */
   implicit def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]) =
-    new SchemaRDD(this, SparkLogicalPlan(ExistingRdd.fromProductRdd(rdd)))
+    new SchemaRDD(this, SparkLogicalPlan(ExistingRdd.fromProductRdd(rdd))(self))
 
   /**
    * Loads a Parquet file, returning the result as a [[SchemaRDD]].
    *
    * @group userf
    */
   def parquetFile(path: String): SchemaRDD =
-    new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration)))
+    new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this))
 
   /**
    * Loads a JSON file (one object per line), returning the result as a [[SchemaRDD]].
@@ -127,7 +127,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    */
   @Experimental
   def jsonRDD(json: RDD[String], samplingRatio: Double): SchemaRDD =
-    new SchemaRDD(this, JsonRDD.inferSchema(json, samplingRatio))
+    new SchemaRDD(this, JsonRDD.inferSchema(self, json, samplingRatio))
 
   /**
    * :: Experimental ::
@@ -160,7 +160,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
       conf: Configuration = new Configuration()): SchemaRDD = {
     new SchemaRDD(
       this,
-      ParquetRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf))
+      ParquetRelation.createEmpty(
+        path, ScalaReflection.attributesFor[A], allowExisting, conf, this))
   }
 
   /**
@@ -170,11 +171,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group userf
    */
   def registerRDDAsTable(rdd: SchemaRDD, tableName: String): Unit = {
-    val name = tableName
-    val newPlan = rdd.logicalPlan transform {
-      case s @ SparkLogicalPlan(ExistingRdd(_, _), _) => s.copy(tableName = name)
-    }
-    catalog.registerTable(None, tableName, newPlan)
+    catalog.registerTable(None, tableName, rdd.logicalPlan)
   }
 
   /**
@@ -212,7 +209,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
       case inMem @ InMemoryRelation(_, _, e: ExistingRdd) =>
         inMem.cachedColumnBuffers.unpersist()
         catalog.unregisterTable(None, tableName)
-        catalog.registerTable(None, tableName, SparkLogicalPlan(e))
+        catalog.registerTable(None, tableName, SparkLogicalPlan(e)(self))
       case inMem: InMemoryRelation =>
         inMem.cachedColumnBuffers.unpersist()
         catalog.unregisterTable(None, tableName)
@@ -413,7 +410,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
         new GenericRow(map.values.toArray.asInstanceOf[Array[Any]]): Row
       }
     }
-    new SchemaRDD(this, SparkLogicalPlan(ExistingRdd(schema, rowRdd)))
+    new SchemaRDD(this, SparkLogicalPlan(ExistingRdd(schema, rowRdd))(self))
   }
 
 }
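
The change that repeats throughout this file is that SparkLogicalPlan (and the Parquet/JSON relation constructors) now receive the SQLContext in a second parameter list, so plan nodes can consult session configuration such as defaultSizeInBytes when estimating statistics, while case-class equality, hashing, and pattern matching still consider only the first parameter list. A standalone sketch of that Scala pattern, with hypothetical stand-in names (Settings, ExamplePlan) rather than the real classes:

// Only the first parameter list of a case class participates in equals/hashCode/unapply;
// the second list is plain threaded context, here marked @transient as in the diff.
case class Settings(defaultSizeInBytes: Long)

case class ExamplePlan(name: String)(@transient val settings: Settings) {
  // Context from the second list is still available to members such as a size estimate.
  lazy val sizeInBytes: Long = settings.defaultSizeInBytes
}

object CurriedCaseClassDemo extends App {
  val a = ExamplePlan("rdd")(Settings(10001L))
  val b = ExamplePlan("rdd")(Settings(99999L))
  println(a == b)         // true: equality ignores the second parameter list
  println(a.sizeInBytes)  // 10001
}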

sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala

Lines changed: 2 additions & 1 deletion
@@ -418,7 +418,8 @@ class SchemaRDD(
    * @group schema
    */
   private def applySchema(rdd: RDD[Row]): SchemaRDD = {
-    new SchemaRDD(sqlContext, SparkLogicalPlan(ExistingRdd(queryExecution.analyzed.output, rdd)))
+    new SchemaRDD(sqlContext,
+      SparkLogicalPlan(ExistingRdd(queryExecution.analyzed.output, rdd))(sqlContext))
   }
 
   // =======================================================================

sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ private[sql] trait SchemaRDDLike {
     // happen right away to let these side effects take place eagerly.
     case _: Command | _: InsertIntoTable | _: InsertIntoCreatedTable | _: WriteToFile =>
       queryExecution.toRdd
-      SparkLogicalPlan(queryExecution.executedPlan)
+      SparkLogicalPlan(queryExecution.executedPlan)(sqlContext)
     case _ =>
       baseLogicalPlan
   }

sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala

Lines changed: 4 additions & 4 deletions
@@ -72,7 +72,7 @@ class JavaSQLContext(val sqlContext: SQLContext) {
       conf: Configuration = new Configuration()): JavaSchemaRDD = {
     new JavaSchemaRDD(
       sqlContext,
-      ParquetRelation.createEmpty(path, getSchema(beanClass), allowExisting, conf))
+      ParquetRelation.createEmpty(path, getSchema(beanClass), allowExisting, conf, sqlContext))
   }
 
   /**
@@ -92,7 +92,7 @@ class JavaSQLContext(val sqlContext: SQLContext) {
         new GenericRow(extractors.map(e => e.invoke(row)).toArray[Any]): ScalaRow
       }
     }
-    new JavaSchemaRDD(sqlContext, SparkLogicalPlan(ExistingRdd(schema, rowRdd)))
+    new JavaSchemaRDD(sqlContext, SparkLogicalPlan(ExistingRdd(schema, rowRdd))(sqlContext))
   }
 
   /**
@@ -101,7 +101,7 @@ class JavaSQLContext(val sqlContext: SQLContext) {
   def parquetFile(path: String): JavaSchemaRDD =
     new JavaSchemaRDD(
       sqlContext,
-      ParquetRelation(path, Some(sqlContext.sparkContext.hadoopConfiguration)))
+      ParquetRelation(path, Some(sqlContext.sparkContext.hadoopConfiguration), sqlContext))
 
   /**
    * Loads a JSON file (one object per line), returning the result as a [[JavaSchemaRDD]].
@@ -120,7 +120,7 @@ class JavaSQLContext(val sqlContext: SQLContext) {
    * @group userf
    */
   def jsonRDD(json: JavaRDD[String]): JavaSchemaRDD =
-    new JavaSchemaRDD(sqlContext, JsonRDD.inferSchema(json, 1.0))
+    new JavaSchemaRDD(sqlContext, JsonRDD.inferSchema(sqlContext, json, 1.0))
 
   /**
    * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only

sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala

Lines changed: 4 additions & 4 deletions
@@ -130,26 +130,26 @@ case class GeneratedAggregate(
     // Builds a new custom class for holding the results of aggregation for a group.
     val initialValues = computeFunctions.flatMap(_.initialValues)
     val newAggregationBuffer = newProjection(initialValues, child.output)
-    logger.info(s"Initial values: ${initialValues.mkString(",")}")
+    log.info(s"Initial values: ${initialValues.mkString(",")}")
 
     // A projection that computes the group given an input tuple.
     val groupProjection = newProjection(groupingExpressions, child.output)
-    logger.info(s"Grouping Projection: ${groupingExpressions.mkString(",")}")
+    log.info(s"Grouping Projection: ${groupingExpressions.mkString(",")}")
 
     // A projection that is used to update the aggregate values for a group given a new tuple.
     // This projection should be targeted at the current values for the group and then applied
     // to a joined row of the current values with the new input row.
     val updateExpressions = computeFunctions.flatMap(_.update)
     val updateSchema = computeFunctions.flatMap(_.schema) ++ child.output
     val updateProjection = newMutableProjection(updateExpressions, updateSchema)()
-    logger.info(s"Update Expressions: ${updateExpressions.mkString(",")}")
+    log.info(s"Update Expressions: ${updateExpressions.mkString(",")}")
 
     // A projection that produces the final result, given a computation.
     val resultProjectionBuilder =
       newMutableProjection(
         resultExpressions,
         (namedGroups.map(_._2.toAttribute) ++ computationSchema).toSeq)
-    logger.info(s"Result Projection: ${resultExpressions.mkString(",")}")
+    log.info(s"Result Projection: ${resultExpressions.mkString(",")}")
 
     val joinedRow = new JoinedRow
 