From cae4413d9f87d9c8c332280a547ae4f8ba63267b Mon Sep 17 00:00:00 2001
From: Sameer Agarwal
Date: Mon, 18 Jan 2016 20:46:12 -0800
Subject: [PATCH 1/2] Add support for (optionally) not storing tables in hive
 metadata format

---
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  8 ++++++
 .../sql/hive/MetastoreDataSourcesSuite.scala  | 28 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 43d84d507b20..416197e6a816 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -323,7 +323,15 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     // TODO: Support persisting partitioned data source relations in Hive compatible format
     val qualifiedTableName = tableIdent.quotedString
+    val skipHiveMetadata = options.getOrElse("skip_hive_metadata", "false").toBoolean
     val (hiveCompatibleTable, logMessage) = (maybeSerDe, dataSource.relation) match {
+      case (Some(serde), relation: HadoopFsRelation) if skipHiveMetadata =>
+        val message =
+          s"Persisting data source relation $qualifiedTableName into " +
+            "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " +
+            "Input path(s): " + relation.paths.mkString("\n", "\n", "")
+        (None, message)
+
       case (Some(serde), relation: HadoopFsRelation)
           if relation.paths.length == 1 && relation.partitionColumns.isEmpty =>
         val hiveTable = newHiveCompatibleMetastoreTable(relation, serde)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 202851ae1366..76d83efaffd3 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -847,4 +847,32 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
     sqlContext.sql("""use default""")
     sqlContext.sql("""drop database if exists testdb8156 CASCADE""")
   }
+
+  test("skip hive metadata on table creation") {
+    val schema = StructType((1 to 5).map(i => StructField(s"c_$i", StringType)))
+
+    catalog.createDataSourceTable(
+      tableIdent = TableIdentifier("not_skip_hive_metadata"),
+      userSpecifiedSchema = Some(schema),
+      partitionColumns = Array.empty[String],
+      bucketSpec = None,
+      provider = "parquet",
+      options = Map("path" -> "just a dummy path", "skip_hive_metadata" -> "false"),
+      isExternal = false)
+
+    assert(catalog.client.getTable("default", "not_skip_hive_metadata").schema
+      .forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == StringType))
+
+    catalog.createDataSourceTable(
+      tableIdent = TableIdentifier("skip_hive_metadata"),
+      userSpecifiedSchema = Some(schema),
+      partitionColumns = Array.empty[String],
+      bucketSpec = None,
+      provider = "parquet",
+      options = Map("path" -> "just a dummy path", "skip_hive_metadata" -> "true"),
+      isExternal = false)
+
+    assert(catalog.client.getTable("default", "skip_hive_metadata").schema
+      .forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == ArrayType(StringType)))
+  }
 }

From 2ce35b19abe69f1791e396f5fd8bc0ebcd38b661 Mon Sep 17 00:00:00 2001
From: Sameer Agarwal
Date: Mon, 25 Jan 2016 12:35:44 -0800
Subject: [PATCH 2/2] Yin's comments

---
 .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala  | 7 +++----
 .../apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 8 ++++++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 416197e6a816..a5af03e4a7a1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -323,13 +323,12 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     // TODO: Support persisting partitioned data source relations in Hive compatible format
     val qualifiedTableName = tableIdent.quotedString
-    val skipHiveMetadata = options.getOrElse("skip_hive_metadata", "false").toBoolean
+    val skipHiveMetadata = options.getOrElse("skipHiveMetadata", "false").toBoolean
     val (hiveCompatibleTable, logMessage) = (maybeSerDe, dataSource.relation) match {
-      case (Some(serde), relation: HadoopFsRelation) if skipHiveMetadata =>
+      case _ if skipHiveMetadata =>
         val message =
           s"Persisting data source relation $qualifiedTableName into " +
-            "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " +
-            "Input path(s): " + relation.paths.mkString("\n", "\n", "")
+            "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive."
         (None, message)
 
       case (Some(serde), relation: HadoopFsRelation)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 76d83efaffd3..c0f36d8e3c98 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -857,9 +857,11 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       partitionColumns = Array.empty[String],
       bucketSpec = None,
       provider = "parquet",
-      options = Map("path" -> "just a dummy path", "skip_hive_metadata" -> "false"),
+      options = Map("path" -> "just a dummy path", "skipHiveMetadata" -> "false"),
       isExternal = false)
 
+    // As a proxy for verifying that the table was stored in Hive compatible format, we verify that
+    // each column of the table is of native type StringType.
     assert(catalog.client.getTable("default", "not_skip_hive_metadata").schema
       .forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == StringType))
 
@@ -869,9 +871,11 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       partitionColumns = Array.empty[String],
       bucketSpec = None,
       provider = "parquet",
-      options = Map("path" -> "just a dummy path", "skip_hive_metadata" -> "true"),
+      options = Map("path" -> "just a dummy path", "skipHiveMetadata" -> "true"),
       isExternal = false)
 
+    // As a proxy for verifying that the table was stored in SparkSQL format, we verify that
+    // the table has a column type as array of StringType.
     assert(catalog.client.getTable("default", "skip_hive_metadata").schema
       .forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == ArrayType(StringType)))
   }