
Commit ec754e2

fix input and output format
1 parent: 5bfa669

5 files changed: +75 −16 lines

sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala

Lines changed: 39 additions & 1 deletion

@@ -27,12 +27,15 @@ import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark.{SparkException, TaskContext}
 import org.apache.spark.internal.Logging
+import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{AttributeSet, UnsafeProjection}
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.{CircularBuffer, SerializableConfiguration, Utils}
 
 trait BaseScriptTransformationExec extends UnaryExecNode {
@@ -87,6 +90,41 @@ trait BaseScriptTransformationExec extends UnaryExecNode {
       }
     }
   }
+
+  def wrapper(data: String, dt: DataType): Any = {
+    dt match {
+      case StringType => data
+      case ByteType => JavaUtils.stringToBytes(data)
+      case IntegerType => data.toInt
+      case ShortType => data.toShort
+      case LongType => data.toLong
+      case FloatType => data.toFloat
+      case DoubleType => data.toDouble
+      case dt: DecimalType => BigDecimal(data)
+      case DateType if conf.datetimeJava8ApiEnabled =>
+        DateTimeUtils.stringToDate(
+          UTF8String.fromString(data),
+          DateTimeUtils.getZoneId(conf.sessionLocalTimeZone))
+          .map(DateTimeUtils.daysToLocalDate).orNull
+      case DateType =>
+        DateTimeUtils.stringToDate(
+          UTF8String.fromString(data),
+          DateTimeUtils.getZoneId(conf.sessionLocalTimeZone))
+          .map(DateTimeUtils.toJavaDate).orNull
+      case TimestampType if conf.datetimeJava8ApiEnabled =>
+        DateTimeUtils.stringToTimestamp(
+          UTF8String.fromString(data),
+          DateTimeUtils.getZoneId(conf.sessionLocalTimeZone))
+          .map(DateTimeUtils.microsToInstant).orNull
+      case TimestampType =>
+        DateTimeUtils.stringToTimestamp(
+          UTF8String.fromString(data),
+          DateTimeUtils.getZoneId(conf.sessionLocalTimeZone))
+          .map(DateTimeUtils.toJavaTimestamp).orNull
+      case CalendarIntervalType => IntervalUtils.stringToInterval(UTF8String.fromString(data))
+      case dataType: DataType => data
+    }
+  }
 }
 
 abstract class BaseScriptTransformationWriterThread extends Thread with Logging {
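The new `wrapper` method is the heart of this commit: everything a script writes to stdout comes back as a string, and the declared output schema drives the conversion back to typed values. A minimal sketch of the same idea in isolation (simplified; the date, timestamp, and interval branches are omitted here because they need the session time zone from `conf`):

import org.apache.spark.sql.types._

// Simplified stand-in for wrapper(): convert one stdout field according to
// its declared type. Unhandled types fall through as the raw string, which
// mirrors the `case dataType: DataType => data` default in the diff above.
def convertField(data: String, dt: DataType): Any = dt match {
  case StringType     => data
  case IntegerType    => data.toInt
  case ShortType      => data.toShort
  case LongType       => data.toLong
  case FloatType      => data.toFloat
  case DoubleType     => data.toDouble
  case _: DecimalType => BigDecimal(data)
  case _              => data
}

// A script that echoes "1<TAB>2.5" against an output schema (a INT, b DOUBLE):
val fields = "1\t2.5".split("\t")
val types  = Seq(IntegerType, DoubleType)
val row    = fields.zip(types).map { case (d, t) => convertField(d, t) }
// row: Array[Any] = Array(1, 2.5)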

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkScriptTransformationExec.scala

Lines changed: 12 additions & 4 deletions

@@ -29,7 +29,7 @@ import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types._
 import org.apache.spark.util.{CircularBuffer, RedirectThread}
 
 /**
@@ -67,7 +67,9 @@ case class SparkScriptTransformationExec(
       stderrBuffer,
       "Thread-ScriptTransformation-STDERR-Consumer").start()
 
-    val outputProjection = new InterpretedProjection(input, child.output)
+    val finalInput = input.map(Cast(_, StringType).withTimeZone(conf.sessionLocalTimeZone))
+
+    val outputProjection = new InterpretedProjection(finalInput, child.output)
 
     // This new thread will consume the ScriptTransformation's input rows and write them to the
     // external process. That process's output will be read by this current thread.
@@ -116,11 +118,17 @@ case class SparkScriptTransformationExec(
         if (!ioschema.schemaLess) {
           new GenericInternalRow(
             prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"))
-              .map(CatalystTypeConverters.convertToCatalyst))
+              .zip(output)
+              .map { case (data, dataType) =>
+                CatalystTypeConverters.convertToCatalyst(wrapper(data, dataType.dataType))
+              })
         } else {
           new GenericInternalRow(
             prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"), 2)
-              .map(CatalystTypeConverters.convertToCatalyst))
+              .zip(output)
+              .map { case (data, dataType) =>
+                CatalystTypeConverters.convertToCatalyst(wrapper(data, dataType.dataType))
+              })
         }
       }
     }
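On the input side, the cast to string that previously lived in `SparkStrategies` now happens inside the exec node itself. A sketch of what `finalInput` amounts to, using hypothetical attributes and an assumed UTC session zone:

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast}
import org.apache.spark.sql.types._

// Hypothetical input columns for TRANSFORM(id, ts): an INT and a TIMESTAMP.
val id = AttributeReference("id", IntegerType)()
val ts = AttributeReference("ts", TimestampType)()

// Every column is cast to STRING before the writer thread serializes rows,
// and the session time zone is attached so timestamp rendering is stable.
val finalInput = Seq(id, ts).map(Cast(_, StringType).withTimeZone("UTC"))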

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala

Lines changed: 2 additions & 1 deletion

@@ -713,7 +713,8 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) {
           None
         }
         (Seq.empty, Option(name), props.toSeq, recordHandler)
-
+      // SPARK-32106: When there is no definition about format, we return empty result
+      // then we finally execute with SparkScriptTransformationExec
      case null =>
        (Nil, None, Seq.empty, None)
    }
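The new comment pins down the intent of the `case null` branch. As an illustration (assuming a SparkSession named `spark` and a build that includes this change), a TRANSFORM query with no ROW FORMAT clause at all now parses to an empty format spec and is planned as SparkScriptTransformationExec rather than requiring the Hive SerDe path:

// Hypothetical usage: no ROW FORMAT / SERDE clause anywhere in the query,
// so the parser yields (Nil, None, Seq.empty, None) for both input and
// output formats and planning picks the non-Hive exec node.
spark.range(3).createOrReplaceTempView("t")
spark.sql("SELECT TRANSFORM(id) USING 'cat' AS (id STRING) FROM t").show()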

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 2 additions & 4 deletions

@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.execution
 
-import java.time.ZoneId
-
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{execution, AnalysisException, Strategy}
 import org.apache.spark.sql.catalyst.InternalRow
@@ -40,7 +38,7 @@ import org.apache.spark.sql.execution.streaming._
 import org.apache.spark.sql.execution.streaming.sources.MemoryPlan
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
-import org.apache.spark.sql.types.{StringType, StructType}
+import org.apache.spark.sql.types.StructType
 
 /**
  * Converts a logical plan into zero or more SparkPlans. This API is exposed for experimenting
@@ -539,7 +537,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.ScriptTransformation(input, script, output, child, ioschema)
           if ioschema.inputSerdeClass.isEmpty && ioschema.outputSerdeClass.isEmpty =>
         SparkScriptTransformationExec(
-          input.map(Cast(_, StringType).withTimeZone(conf.sessionLocalTimeZone)),
+          input,
          script,
          output,
          planLater(child),
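Net effect of this file: the strategy layer goes back to passing `input` through untouched, and each exec node decides for itself whether to cast to string. That split is what the Hive change below relies on, since there the cast is only correct when no input SerDe is configured.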

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala

Lines changed: 20 additions & 6 deletions

@@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.hive.HiveInspectors
 import org.apache.spark.sql.hive.HiveShim._
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types.{DataType, StringType}
 import org.apache.spark.util.{CircularBuffer, RedirectThread, Utils}
 
 /**
@@ -78,17 +78,25 @@ case class HiveScriptTransformationExec(
       stderrBuffer,
       "Thread-ScriptTransformation-STDERR-Consumer").start()
 
-    val outputProjection = new InterpretedProjection(input, child.output)
-
     // This nullability is a performance optimization in order to avoid an Option.foreach() call
    // inside of a loop
    @Nullable val (inputSerde, inputSoi) = ioschema.initInputSerDe(input).getOrElse((null, null))
 
+    // For HiveScriptTransformationExec, if inputSerde == null, but outputSerde != null
+    // We will use StringBuffer to pass data, in this case, we should cast data as string too.
+    val finalInput = if (inputSerde == null) {
+      input.map(Cast(_, StringType).withTimeZone(conf.sessionLocalTimeZone))
+    } else {
+      input
+    }
+
+    val outputProjection = new InterpretedProjection(finalInput, child.output)
+
     // This new thread will consume the ScriptTransformation's input rows and write them to the
     // external process. That process's output will be read by this current thread.
     val writerThread = HiveScriptTransformationWriterThread(
       inputIterator.map(outputProjection),
-      input.map(_.dataType),
+      finalInput.map(_.dataType),
       inputSerde,
       inputSoi,
       ioschema,
@@ -178,11 +186,17 @@ case class HiveScriptTransformationExec(
         if (!ioschema.schemaLess) {
           new GenericInternalRow(
             prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"))
-              .map(CatalystTypeConverters.convertToCatalyst))
+              .zip(output)
+              .map { case (data, dataType) =>
+                CatalystTypeConverters.convertToCatalyst(wrapper(data, dataType.dataType))
+              })
         } else {
           new GenericInternalRow(
             prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"), 2)
-              .map(CatalystTypeConverters.convertToCatalyst))
+              .zip(output)
+              .map { case (data, dataType) =>
+                CatalystTypeConverters.convertToCatalyst(wrapper(data, dataType.dataType))
+              })
         }
       } else {
        val raw = outputSerde.deserialize(scriptOutputWritable)
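The conditional cast is easiest to read from the user's side. A sketch of the two paths, assuming a Hive-enabled SparkSession `spark` and an existing table `t`:

// 1) No SerDe clause: rows cross the pipe as delimited text, so the node
//    casts each input column to STRING (with the session time zone) first.
spark.sql("SELECT TRANSFORM(id, ts) USING 'cat' AS (a STRING, b STRING) FROM t")

// 2) Explicit input SerDe: typed values are handed to the SerDe's
//    ObjectInspector directly, so inputSerde != null and no cast is added.
spark.sql(
  """SELECT TRANSFORM(id)
    |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
    |USING 'cat' AS (a STRING)
    |FROM t""".stripMargin)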
