
Commit b6946e6

Fixes MiMA issues, addresses comments
1 parent 8232e17 commit b6946e6

3 files changed: +89 -65 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala

Lines changed: 67 additions & 1 deletion
@@ -19,6 +19,7 @@ package org.apache.spark.sql.types
 
 import java.sql.Timestamp
 
+import scala.collection.mutable.ArrayBuffer
 import scala.math.Numeric.{FloatAsIfIntegral, DoubleAsIfIntegral}
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.{TypeTag, runtimeMirror, typeTag}
@@ -29,6 +30,7 @@ import org.json4s.JsonAST.JValue
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
+import org.apache.spark.SparkException
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.ScalaReflectionLock
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
@@ -159,7 +161,6 @@ object DataType {
       case failure: NoSuccess =>
         throw new IllegalArgumentException(s"Unsupported dataType: $asString, $failure")
     }
-
   }
 
   protected[types] def buildFormattedString(
@@ -754,6 +755,57 @@ object StructType {
   def apply(fields: java.util.List[StructField]): StructType = {
     StructType(fields.toArray.asInstanceOf[Array[StructField]])
   }
+
+  private[sql] def merge(left: DataType, right: DataType): DataType =
+    (left, right) match {
+      case (ArrayType(leftElementType, leftContainsNull),
+            ArrayType(rightElementType, rightContainsNull)) =>
+        ArrayType(
+          merge(leftElementType, rightElementType),
+          leftContainsNull || rightContainsNull)
+
+      case (MapType(leftKeyType, leftValueType, leftContainsNull),
+            MapType(rightKeyType, rightValueType, rightContainsNull)) =>
+        MapType(
+          merge(leftKeyType, rightKeyType),
+          merge(leftValueType, rightValueType),
+          leftContainsNull || rightContainsNull)
+
+      case (StructType(leftFields), StructType(rightFields)) =>
+        val newFields = ArrayBuffer.empty[StructField]
+
+        leftFields.foreach {
+          case leftField @ StructField(leftName, leftType, leftNullable, _) =>
+            rightFields
+              .find(_.name == leftName)
+              .map { case rightField @ StructField(_, rightType, rightNullable, _) =>
+                leftField.copy(
+                  dataType = merge(leftType, rightType),
+                  nullable = leftNullable || rightNullable)
+              }
+              .orElse(Some(leftField))
+              .foreach(newFields += _)
+        }
+
+        rightFields
+          .filterNot(f => leftFields.map(_.name).contains(f.name))
+          .foreach(newFields += _)
+
+        StructType(newFields)
+
+      case (DecimalType.Fixed(leftPrecision, leftScale),
+            DecimalType.Fixed(rightPrecision, rightScale)) =>
+        DecimalType(leftPrecision.max(rightPrecision), leftScale.max(rightScale))
+
+      case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_])
+          if leftUdt.userClass == rightUdt.userClass => leftUdt
+
+      case (leftType, rightType) if leftType == rightType =>
+        leftType
+
+      case _ =>
+        throw new SparkException(s"Failed to merge incompatible data types $left and $right")
+    }
 }
 
 
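The decimal branch above widens both precision and scale rather than failing; a quick sketch of the intended behavior (hypothetical values, and note that merge is private[sql], so these calls only compile inside Spark's own org.apache.spark.sql packages):

import org.apache.spark.sql.types._

// Widening two fixed decimals keeps the larger precision and the larger scale:
StructType.merge(DecimalType(5, 2), DecimalType(10, 1))  // DecimalType(10, 2)

// Identical types pass through unchanged; incompatible ones fail fast:
StructType.merge(IntegerType, IntegerType)  // IntegerType
StructType.merge(IntegerType, StringType)   // throws SparkException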
@@ -890,6 +942,20 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField] {
     val fieldTypes = fields.map(field => s"${field.name}:${field.dataType.simpleString}")
     s"struct<${fieldTypes.mkString(",")}>"
   }
+
+  /**
+   * Merges with another schema (`StructType`). For a struct field A from `this` and a struct field
+   * B from `that`,
+   *
+   * 1. If A and B have the same name and data type, they are merged to a field C with the same name
+   *    and data type. C is nullable if and only if either A or B is nullable.
+   * 2. If A doesn't exist in `that`, it's included in the result schema.
+   * 3. If B doesn't exist in `this`, it's also included in the result schema.
+   * 4. Otherwise, `this` and `that` are considered as conflicting schemas and an exception would be
+   *    thrown.
+   */
+  private[sql] def merge(that: StructType): StructType =
+    StructType.merge(this, that).asInstanceOf[StructType]
 }
 
 
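To make the four rules concrete, here is a small sketch (hypothetical field names; again, merge is private[sql], so this only compiles within Spark's sql packages):

import org.apache.spark.sql.types._

val left = StructType(Seq(
  StructField("a", IntegerType, nullable = false),
  StructField("b", StringType)))                    // only in `left`

val right = StructType(Seq(
  StructField("a", IntegerType, nullable = true),   // same name and type as left's "a"
  StructField("c", DoubleType)))                    // only in `right`

left.merge(right)
// struct<a:int,b:string,c:double>: "a" becomes nullable (rule 1),
// "b" and "c" are carried over unchanged (rules 2 and 3)

left.merge(StructType(StructField("a", StringType) :: Nil))
// throws SparkException: "a" is int on one side and string on the other (rule 4)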
sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala

Lines changed: 2 additions & 56 deletions
@@ -284,7 +284,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
       ctype: DataType,
       name: String,
       nullable: Boolean = true,
-      inArray: Boolean = false,
+      inArray: Boolean = false,
       toThriftSchemaNames: Boolean = false): ParquetType = {
     val repetition =
       if (inArray) {
@@ -339,7 +339,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
     }
     case StructType(structFields) => {
       val fields = structFields.map {
-        field => fromDataType(field.dataType, field.name, field.nullable,
+        field => fromDataType(field.dataType, field.name, field.nullable,
           inArray = false, toThriftSchemaNames)
       }
       new ParquetGroupType(repetition, name, fields.toSeq)
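(In both hunks above the removed and added lines read identically; the changes are apparently whitespace-only fixes, e.g. trailing spaces or indentation picked up during review.)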
@@ -522,58 +522,4 @@ private[parquet] object ParquetTypesConverter extends Logging {
       attributes
     }
   }
-
-  def mergeCatalystSchemas(left: StructType, right: StructType): StructType =
-    mergeCatalystDataTypes(left, right).asInstanceOf[StructType]
-
-  def mergeCatalystDataTypes(left: DataType, right: DataType): DataType =
-    (left, right) match {
-      case (ArrayType(leftElementType, leftContainsNull),
-            ArrayType(rightElementType, rightContainsNull)) =>
-        ArrayType(
-          mergeCatalystDataTypes(leftElementType, rightElementType),
-          leftContainsNull || rightContainsNull)
-
-      case (MapType(leftKeyType, leftValueType, leftContainsNull),
-            MapType(rightKeyType, rightValueType, rightContainsNull)) =>
-        MapType(
-          mergeCatalystDataTypes(leftKeyType, rightKeyType),
-          mergeCatalystDataTypes(leftValueType, rightValueType),
-          leftContainsNull || rightContainsNull)
-
-      case (StructType(leftFields), StructType(rightFields)) =>
-        val newFields = ArrayBuffer.empty[StructField]
-
-        leftFields.foreach {
-          case leftField @ StructField(leftName, leftType, leftNullable, _) =>
-            rightFields
-              .find(_.name == leftName)
-              .map { case rightField @ StructField(_, rightType, rightNullable, _) =>
-                leftField.copy(
-                  dataType = mergeCatalystDataTypes(leftType, rightType),
-                  nullable = leftNullable || rightNullable)
-              }
-              .orElse(Some(leftField))
-              .foreach(newFields += _)
-        }
-
-        rightFields
-          .filterNot(f => leftFields.map(_.name).contains(f.name))
-          .foreach(newFields += _)
-
-        StructType(newFields)
-
-      case (DecimalType.Fixed(leftPrecision, leftScale),
-            DecimalType.Fixed(rightPrecision, rightScale)) =>
-        DecimalType(leftPrecision.max(rightPrecision), leftScale.max(rightScale))
-
-      case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_])
-          if leftUdt.userClass == rightUdt.userClass => leftUdt
-
-      case (leftType, rightType) if leftType == rightType =>
-        leftType
-
-      case _ =>
-        throw new SparkException(s"Failed to merge incompatible data types $left and $right")
-    }
 }

sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala

Lines changed: 20 additions & 8 deletions
@@ -45,6 +45,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.parquet.ParquetTypesConverter._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{IntegerType, StructField, StructType, _}
+import org.apache.spark.sql.types.StructType._
 import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext}
 import org.apache.spark.{Partition => SparkPartition, TaskContext, SerializableWritable, Logging, SparkException}
 
@@ -173,7 +174,7 @@ case class ParquetRelation2
     val statuses = paths.distinct.map(p => fs.getFileStatus(fs.makeQualified(new Path(p))))
     // Support either reading a collection of raw Parquet part-files, or a collection of folders
     // containing Parquet files (e.g. partitioned Parquet table).
-    assert(statuses.forall(_.isFile) || statuses.forall(_.isDir))
+    assert(statuses.forall(!_.isDir) || statuses.forall(_.isDir))
     statuses.toArray
   }
 
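The rewritten assertion is presumably the same kind of compatibility fix as the `Path.isRoot` change later in this commit: `FileStatus.isFile` does not exist in older Hadoop releases, while `isDir` does, so `!_.isDir` expresses "is a file" in a way both old and new Hadoop versions support.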
@@ -252,11 +253,18 @@
       // we can't trust the summary files if users require a merged schema, and must touch all part-
       // files to do the merge.
       if (shouldMergeSchemas) {
-        dataStatuses.toSeq
+        // Also includes summary files, 'cause there might be empty partition directories.
+        (metadataStatuses ++ commonMetadataStatuses ++ dataStatuses).toSeq
       } else {
+        // Tries any "_common_metadata" first. Parquet files written by old versions of Parquet
+        // don't have this.
         commonMetadataStatuses.headOption
+          // Falls back to "_metadata"
           .orElse(metadataStatuses.headOption)
-          // Summary file(s) not found, falls back to the first part-file.
+          // Summary file(s) not found: the Parquet file is either corrupted, or different part-
+          // files contain conflicting user defined metadata (two or more values are associated
+          // with the same key in different files). In either case, we fall back to the first
+          // part-file, and just assume all schemas are consistent.
           .orElse(dataStatuses.headOption)
           .toSeq
       }
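The else branch is a plain Option fallback chain; in miniature (hypothetical file names, not part of the diff):

// Pick the first summary file that exists, otherwise the first part-file:
val commonMetadata: Option[String] = None              // no "_common_metadata" present
val metadata: Option[String] = Some("_metadata")
val parts = Seq("part-00000.parquet", "part-00001.parquet")

val picked = commonMetadata
  .orElse(metadata)
  .orElse(parts.headOption)
  .toSeq
// Seq("_metadata")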
@@ -507,14 +515,17 @@ case class ParquetRelation2
 
 object ParquetRelation2 {
   // Whether we should merge schemas collected from all Parquet part-files.
-  val MERGE_SCHEMA = "parquet.mergeSchema"
+  val MERGE_SCHEMA = "mergeSchema"
 
   // Hive Metastore schema, passed in when the Parquet relation is converted from Metastore
-  val METASTORE_SCHEMA = "parquet.metastoreSchema"
+  val METASTORE_SCHEMA = "metastoreSchema"
 
   // Default partition name to use when the partition column value is null or empty string
   val DEFAULT_PARTITION_NAME = "partition.defaultName"
 
+  // When true, the Parquet data source caches Parquet metadata for performance
+  val CACHE_METADATA = "cacheMetadata"
+
   private[parquet] def readSchema(footers: Seq[Footer], sqlContext: SQLContext): StructType = {
     footers.map { footer =>
       val metadata = footer.getParquetMetadata.getFileMetaData
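These constants are the option keys users pass to the data source. A hedged usage sketch against the Spark 1.3-era API (the load call and all values here are assumptions for illustration, not part of this diff):

// Hypothetical: enable schema merging and metadata caching for a Parquet table.
val df = sqlContext.load(
  "org.apache.spark.sql.parquet",
  Map(
    "path" -> "/data/events",      // hypothetical location
    "mergeSchema" -> "true",       // ParquetRelation2.MERGE_SCHEMA
    "cacheMetadata" -> "true"))    // ParquetRelation2.CACHE_METADATA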
@@ -535,7 +546,7 @@ object ParquetRelation2 {
           sqlContext.conf.isParquetINT96AsTimestamp))
       }
     }.reduce { (left, right) =>
-      try mergeCatalystSchemas(left, right) catch { case e: Throwable =>
+      try left.merge(right) catch { case e: Throwable =>
         throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e)
       }
     }
@@ -637,14 +648,15 @@
       path: Path,
       defaultPartitionName: String): PartitionValues = {
     val columns = ArrayBuffer.empty[(String, Literal)]
-    var finished = path.isRoot
+    // Old Hadoop versions don't have `Path.isRoot`
+    var finished = path.getParent == null
     var chopped = path
 
     while (!finished) {
       val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName)
       maybeColumn.foreach(columns += _)
       chopped = chopped.getParent
-      finished = maybeColumn.isEmpty || chopped.isRoot
+      finished = maybeColumn.isEmpty || chopped.getParent == null
     }
 
     val (columnNames, values) = columns.reverse.unzip