
Commit 5c03e07
Author: Marcelo Vanzin

Cleanup AdjustTimestamps.

Running the rule during resolution also makes it possible to do all the needed adjustments with a single rule, instead of needing a Hive-specific rule for InsertIntoHiveTable.

1 parent: 1eaa045
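For context, a rough end-to-end sketch (not part of this commit) of the behavior the rule implements. The table property key is DateTimeUtils.TIMEZONE_PROPERTY in the code below; its literal value is assumed to be 'timezone' here purely for illustration, as are the table and object names.

import org.apache.spark.sql.SparkSession

// Hypothetical walkthrough: a table with a configured time zone, read under a
// different session time zone. The 'timezone' key and names are assumptions.
object AdjustTimestampsExample extends App {
  val spark = SparkSession.builder().master("local[1]").getOrCreate()
  spark.conf.set("spark.sql.session.timeZone", "UTC")

  spark.sql(
    """CREATE TABLE events (ts TIMESTAMP)
      |USING parquet
      |TBLPROPERTIES ('timezone' = 'America/Los_Angeles')""".stripMargin)

  // During resolution, AdjustTimestamps wraps the scan in a Project that
  // corrects `ts` from the table zone (LA) to the session zone (UTC);
  // inserts into `events` get the inverse correction.
  spark.table("events").explain(true)
}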

File tree: 4 files changed, +86 -184 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/AdjustTimestamps.scala

Lines changed: 83 additions & 153 deletions
@@ -16,183 +16,113 @@
  */
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.spark.sql.{AnalysisException, SparkSession}
-import org.apache.spark.sql.catalyst.analysis.UnresolvedException
+import org.apache.spark.sql.{AnalysisException}
 import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation}
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{StringType, TimestampType}
 
-abstract class BaseAdjustTimestampsRule(sparkSession: SparkSession) extends Rule[LogicalPlan] {
+/**
+ * Apply a correction to data loaded from, or saved to, tables that have a configured time zone, so
+ * that timestamps can be read like TIMESTAMP WITHOUT TIMEZONE. This gives correct behavior if you
+ * process data with machines in different timezones, or if you access the data from multiple SQL
+ * engines.
+ */
+case class AdjustTimestamps(conf: SQLConf) extends Rule[LogicalPlan] {
 
-  /**
-   * Apply the correction to all timestamp inputs, and replace all references to the raw attributes
-   * with the new converted inputs.
-   * @return The converted plan, and the replacements to be applied further up the plan
-   */
-  protected def convertInputs(
-      plan: LogicalPlan
-    ): (LogicalPlan, Map[ExprId, NamedExpression]) = plan match {
-    case alreadyConverted@Project(exprs, _) if hasCorrection(exprs) =>
-      (alreadyConverted, Map())
+  def apply(plan: LogicalPlan): LogicalPlan = plan match {
+    case insert: InsertIntoHadoopFsRelationCommand =>
+      val adjusted = adjustTimestampsForWrite(insert.query, insert.catalogTable, insert.options)
+      insert.copy(query = adjusted)
 
-    case lr @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _) =>
-      val tzOpt = extractTableTz(lr.catalogTable, fsRelation.options)
-      tzOpt.flatMap { tableTz =>
-        // the table has a timezone set, so after reading the data, apply a conversion
-
-        // SessionTZ (instead of JVM TZ) will make the time display correctly in SQL queries, but
-        // incorrectly if you pull Timestamp objects out (eg. with a dataset.collect())
-        val toTz = sparkSession.sessionState.conf.sessionLocalTimeZone
-        if (toTz != tableTz) {
-          logDebug(s"table tz = $tableTz; converting to current session tz = $toTz")
-          // find timestamp columns, and convert their tz
-          convertTzForAllTimestamps(lr, tableTz, toTz).map { case (fields, replacements) =>
-            (new Project(fields, lr), replacements)
-          }
-        } else {
-          None
-        }
-      }.getOrElse((lr, Map()))
-
-    case relation @ HiveTableRelation(table, _, _) =>
-      val tzOpt = extractTableTz(Some(table), Map())
-      tzOpt.flatMap { tz =>
-        val toTz = sparkSession.sessionState.conf.sessionLocalTimeZone
-        if (toTz != tz) {
-          logDebug(s"table tz = $tz; converting to current session tz = $toTz")
-          // find timestamp columns, and convert their tz
-          convertTzForAllTimestamps(relation, tz, toTz).map { case (fields, replacements) =>
-            (new Project(fields, relation), replacements)
-          }
-        } else {
-          None
-        }
-      }.getOrElse((relation, Map()))
+    case insert @ InsertIntoTable(table: HiveTableRelation, _, query, _, _) =>
+      val adjusted = adjustTimestampsForWrite(insert.query, Some(table.tableMeta), Map())
+      insert.copy(query = adjusted)
 
     case other =>
-      // first, process all the children -- this ensures we have the right renames in scope.
-      var newReplacements = Map[ExprId, NamedExpression]()
-      val fixedPlan = other.mapChildren { originalPlan =>
-        val (newPlan, extraReplacements) = convertInputs(originalPlan)
-        newReplacements ++= extraReplacements
-        newPlan
-      }
-      // now we need to adjust all names to use the new version.
-      val fixedExpressions = fixedPlan.mapExpressions { outerExp =>
-        val adjustedExp = outerExp.transformUp { case exp: NamedExpression =>
-          try {
-            newReplacements.get(exp.exprId).getOrElse(exp)
-          } catch {
-            // UnresolvedAttributes etc. will cause problems later anyway, we just dont' want to
-            // expose the error here
-            case ue: UnresolvedException[_] => exp
-          }
-        }
-        adjustedExp
-      }
-      (fixedExpressions, newReplacements)
+      convertInputs(plan)
   }
 
-  protected def hasCorrection(exprs: Seq[Expression]): Boolean = {
-    exprs.exists { expr =>
-      expr.isInstanceOf[TimestampTimezoneCorrection] || hasCorrection(expr.children)
-    }
+  private def convertInputs(plan: LogicalPlan): LogicalPlan = plan match {
+    case adjusted @ Project(exprs, _) if hasCorrection(exprs) =>
+      adjusted
+
+    case lr @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _) =>
+      adjustTimestamps(lr, lr.catalogTable, fsRelation.options, true)
+
+    case hr @ HiveTableRelation(table, _, _) =>
+      adjustTimestamps(hr, Some(table), Map(), true)
+
+    case other =>
+      other.mapChildren { originalPlan =>
+        convertInputs(originalPlan)
+      }
   }
 
-  protected def writeConversion(
+  private def adjustTimestamps(
+      plan: LogicalPlan,
       table: Option[CatalogTable],
       options: Map[String, String],
-      query: LogicalPlan): LogicalPlan = {
-    val tableTz = extractTableTz(table, options)
-    val internalTz = sparkSession.sessionState.conf.sessionLocalTimeZone
-    if (tableTz.isDefined && tableTz != internalTz) {
-      convertTzForAllTimestamps(query, internalTz, tableTz.get).map { case (fields, _) =>
-        new Project(fields, query)
-      }.getOrElse(query)
-    } else {
-      query
-    }
-  }
-
-  protected def extractTableTz(options: Map[String, String]): Option[String] = {
-    options.get(DateTimeUtils.TIMEZONE_PROPERTY)
+      reading: Boolean): LogicalPlan = {
+    val tableTz = table.flatMap(_.properties.get(DateTimeUtils.TIMEZONE_PROPERTY))
+      .orElse(options.get(DateTimeUtils.TIMEZONE_PROPERTY))
+
+    tableTz.map { tz =>
+      val sessionTz = conf.sessionLocalTimeZone
+      val toTz = if (reading) sessionTz else tz
+      val fromTz = if (reading) tz else sessionTz
+      logDebug(
+        s"table tz = $tz; converting ${if (reading) "to" else "from"} session tz = $sessionTz\n")
+
+      var hasTimestamp = false
+      val adjusted = plan.expressions.map {
+        case e: NamedExpression if e.dataType == TimestampType =>
+          val adjustment = TimestampTimezoneCorrection(e.toAttribute,
+            Literal.create(fromTz, StringType), Literal.create(toTz, StringType))
+          hasTimestamp = true
+          Alias(adjustment, e.name)()
+
+        case other: NamedExpression =>
+          other
+
+        case unnamed =>
+          throw new AnalysisException(s"Unexpected expr: $unnamed")
+      }.toList
+
+      if (hasTimestamp) Project(adjusted, plan) else plan
+    }.getOrElse(plan)
   }
 
-  protected def extractTableTz(
+  private def adjustTimestampsForWrite(
+      query: LogicalPlan,
       table: Option[CatalogTable],
-      options: Map[String, String]): Option[String] = {
-    table.flatMap { tbl => extractTableTz(tbl.properties) }.orElse(extractTableTz(options))
+      options: Map[String, String]): LogicalPlan = query match {
+    case unadjusted if !hasOutputCorrection(unadjusted.expressions) =>
+      // The query might be reading from a table with a configured time zone; this makes sure we
+      // apply the correct conversions for that data.
+      val fixedInputs = convertInputs(unadjusted)
+      adjustTimestamps(fixedInputs, table, options, false)
+
+    case _ =>
+      query
   }
 
-  /**
-   * Find all timestamp fields in the given relation. For each one, replace it with an expression
-   * that converts the timezone of the timestamp, and assigns an alias to that new expression.
-   * (Leave non-timestamp fields alone.) Also return a map from the original id for the timestamp
-   * field, to the new alias of the timezone-corrected expression.
-   */
-  protected def convertTzForAllTimestamps(
-      relation: LogicalPlan,
-      fromTz: String,
-      toTz: String): Option[(Seq[NamedExpression], Map[ExprId, NamedExpression])] = {
-    val schema = relation.schema
-    var foundTs = false
-    var replacements = Map[ExprId, NamedExpression]()
-    val modifiedFields: Seq[NamedExpression] = schema.map { field =>
-      val exp = relation.resolve(Seq(field.name), sparkSession.sessionState.conf.resolver)
-        .getOrElse {
-          val inputColumns = schema.map(_.name).mkString(", ")
-          throw new AnalysisException(
-            s"cannot resolve '${field.name}' given input columns: [$inputColumns]")
-        }
-      if (field.dataType == TimestampType) {
-        foundTs = true
-        val adjustedTs = Alias(
-          TimestampTimezoneCorrection(
-            exp,
-            Literal.create(fromTz, StringType),
-            Literal.create(toTz, StringType)
-          ),
-          field.name
-        )()
-        // we also need to rename all occurrences of this field further up in the plan
-        // to refer to our new adjusted timestamp, so we pass this replacement up the call stack.
-        replacements += exp.exprId -> adjustedTs.toAttribute
-        adjustedTs
-      } else {
-        exp
-      }
+  private def hasCorrection(exprs: Seq[Expression]): Boolean = {
+    exprs.exists { expr =>
+      expr.isInstanceOf[TimestampTimezoneCorrection] || hasCorrection(expr.children)
     }
-    if (foundTs) Some((modifiedFields, replacements)) else None
   }
-}
 
-/**
- * Apply a correction to data loaded from, or saved to, tables that have a configured time zone, so
- * that timestamps can be read like TIMESTAMP WITHOUT TIMEZONE. This gives correct behavior if you
- * process data with machines in different timezones, or if you access the data from multiple SQL
- * engines.
- */
-case class AdjustTimestamps(sparkSession: SparkSession)
-  extends BaseAdjustTimestampsRule(sparkSession) {
-
-  def apply(plan: LogicalPlan): LogicalPlan = {
-    // we can't use transformUp because we want to terminate recursion if there was already
-    // timestamp correction, to keep this idempotent.
-    plan match {
-      case insert: InsertIntoHadoopFsRelationCommand =>
-        // The query might be reading from a parquet table which requires a different conversion;
-        // this makes sure we apply the correct conversions there.
-        val (fixedQuery, _) = convertInputs(insert.query)
-        val fixedOutput = writeConversion(insert.catalogTable, insert.options, fixedQuery)
-        insert.copy(query = fixedOutput)
-
-      case other =>
-        // recurse into children to see if we're reading data that needs conversion
-        val (convertedPlan, _) = convertInputs(plan)
-        convertedPlan
+  private def hasOutputCorrection(exprs: Seq[Expression]): Boolean = {
+    // Output correction is any TimestampTimezoneCorrection that converts from the current
+    // session's time zone.
+    val sessionTz = conf.sessionLocalTimeZone
+    exprs.exists {
+      case TimestampTimezoneCorrection(_, from, _) => from.toString() == sessionTz
+      case other => hasOutputCorrection(other.children)
     }
   }
 }

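TimestampTimezoneCorrection itself is not defined in this commit. As a minimal sketch of the conversion it presumably performs on Catalyst's microsecond timestamp values, assuming it delegates to DateTimeUtils.convertTz like other time zone handling in Spark (the object and method names here are illustrative):

import java.util.TimeZone
import org.apache.spark.sql.catalyst.util.DateTimeUtils

object TimestampCorrectionSketch {
  // Sketch only: shift a microsecond timestamp so its wall-clock reading in
  // toTz matches its previous wall-clock reading in fromTz.
  def correctTimestamp(micros: Long, fromTz: String, toTz: String): Long = {
    DateTimeUtils.convertTz(
      micros,
      TimeZone.getTimeZone(fromTz),
      TimeZone.getTimeZone(toTz))
  }
}

Note that the rule only rewrites the plan (aliased projections over timestamp columns); the per-value arithmetic happens at execution time inside the expression, and the hasCorrection/hasOutputCorrection guards keep repeated analyzer passes from stacking corrections.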
sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala

Lines changed: 1 addition & 1 deletion
@@ -158,13 +158,13 @@ abstract class BaseSessionStateBuilder(
     override val extendedResolutionRules: Seq[Rule[LogicalPlan]] =
       new FindDataSourceTable(session) +:
       new ResolveSQLOnFile(session) +:
+      AdjustTimestamps(conf) +:
       customResolutionRules
 
     override val postHocResolutionRules: Seq[Rule[LogicalPlan]] =
       PreprocessTableCreation(session) +:
       PreprocessTableInsertion(conf) +:
       DataSourceAnalysis(conf) +:
-      AdjustTimestamps(session) +:
       customPostHocResolutionRules
 
     override val extendedCheckRules: Seq[LogicalPlan => Unit] =

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala

Lines changed: 1 addition & 2 deletions
@@ -71,6 +71,7 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session
       new ResolveHiveSerdeTable(session) +:
       new FindDataSourceTable(session) +:
       new ResolveSQLOnFile(session) +:
+      AdjustTimestamps(conf) +:
       customResolutionRules
 
     override val postHocResolutionRules: Seq[Rule[LogicalPlan]] =
@@ -80,8 +81,6 @@
       PreprocessTableInsertion(conf) +:
       DataSourceAnalysis(conf) +:
       HiveAnalysis +:
-      HiveAdjustTimestamps(session) +:
-      AdjustTimestamps(session) +:
       customPostHocResolutionRules
 
     override val extendedCheckRules: Seq[LogicalPlan => Unit] =
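With this change both session builders register the same AdjustTimestamps(conf) instance during resolution rather than post-hoc. For comparison, third-party code can hook a rule into the same resolution phase through the extensions API; a sketch under the assumption that the SparkSessionExtensions API (Spark 2.2+) is available, with MyResolutionRule as a hypothetical placeholder:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical no-op rule standing in for anything that must run during
// resolution alongside rules like AdjustTimestamps.
case class MyResolutionRule(session: SparkSession) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan
}

object ExtensionsExample extends App {
  val spark = SparkSession.builder()
    .master("local[1]")
    .withExtensions(_.injectResolutionRule(MyResolutionRule))
    .getOrCreate()
}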

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala

Lines changed: 1 addition & 28 deletions
@@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoTab
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils}
-import org.apache.spark.sql.execution.datasources.{BaseAdjustTimestampsRule, CreateTable, LogicalRelation}
+import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions}
 import org.apache.spark.sql.hive.execution._
 import org.apache.spark.sql.hive.orc.OrcFileFormat
@@ -217,33 +217,6 @@ case class RelationConversions(
   }
 }
 
-/**
- * Apply a correction to data loaded from, or saved to, tables that have a configured time zone, so
- * that timestamps can be read like TIMESTAMP WITHOUT TIMEZONE. This gives correct behavior if you
- * process data with machines in different timezones, or if you access the data from multiple SQL
- * engines.
- */
-case class HiveAdjustTimestamps(sparkSession: SparkSession)
-  extends BaseAdjustTimestampsRule(sparkSession) {
-
-  def apply(plan: LogicalPlan): LogicalPlan = {
-    // we can't use transformUp because we want to terminate recursion if there was already
-    // timestamp correction, to keep this idempotent.
-    plan match {
-      case insert: InsertIntoHiveTable =>
-        // The query might be reading from a parquet table which requires a different conversion;
-        // this makes sure we apply the correct conversions there.
-        val (fixedQuery, _) = convertInputs(insert.query)
-        val fixedOutput = writeConversion(Some(insert.table), Map(), fixedQuery)
-        insert.copy(query = fixedOutput)
-
-      case other =>
-        plan
-    }
-  }
-
-}
-
 private[hive] trait HiveStrategies {
   // Possibly being too clever with types here... or not clever enough.
   self: SparkPlanner =>
