R/pkg/inst/tests/testthat/test_sparkSQL.R (2 additions & 2 deletions)
@@ -2659,15 +2659,15 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume
# It makes sure that we can omit path argument in write.df API and then it calls
# DataFrameWriter.save() without path.
expect_error(write.df(df, source = "csv"),
"Error in save : illegal argument - 'path' is not specified")
"Error in save : illegal argument - Expected exactly one path to be specified")
expect_error(write.json(df, jsonPath),
"Error in json : analysis error - path file:.*already exists")
expect_error(write.text(df, jsonPath),
"Error in text : analysis error - path file:.*already exists")
expect_error(write.orc(df, jsonPath),
"Error in orc : analysis error - path file:.*already exists")
expect_error(write.parquet(df, jsonPath),
"Error in parquet : analysis error - path file:.*already exists")
"Error in parquet : analysis error - path file:.*already exists")

# Arguments checking in R side.
expect_error(write.df(df, "data.tmp", source = c(1, 2)),
@@ -196,18 +196,32 @@ class InMemoryCatalog(
throw new TableAlreadyExistsException(db = db, table = table)
}
} else {
if (tableDefinition.tableType == CatalogTableType.MANAGED) {
val dir = new Path(catalog(db).db.locationUri, table)
// Set the default table location if this is a managed table and its location is not
// specified.
// Ideally we should not create a managed table with location, but Hive serde table can
// specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have
// to create the table directory and write out data before we create this table, to avoid
// exposing a partial written table.
val needDefaultTableLocation =
tableDefinition.tableType == CatalogTableType.MANAGED &&
tableDefinition.storage.locationUri.isEmpty
Contributor:

For a managed table, when will its location URI be empty? When it is a data source table?

Contributor Author (@cloud-fan, Oct 26, 2016):

Ideally, managed tables should not have a location URI set, but we have two special cases:

  1. Users can create a managed Hive serde table with a location.
  2. For CTAS of a data source table, we have to write the data before creating the table in the metastore, so we set the location URI in the table metadata.

val tableWithLocation = if (needDefaultTableLocation) {
val defaultTableLocation = new Path(catalog(db).db.locationUri, table)
try {
val fs = dir.getFileSystem(hadoopConfig)
fs.mkdirs(dir)
val fs = defaultTableLocation.getFileSystem(hadoopConfig)
fs.mkdirs(defaultTableLocation)
} catch {
case e: IOException =>
throw new SparkException(s"Unable to create table $table as failed " +
s"to create its directory $dir", e)
s"to create its directory $defaultTableLocation", e)
}
tableDefinition.withNewStorage(locationUri = Some(defaultTableLocation.toUri.toString))
} else {
tableDefinition
}
catalog(db).tables.put(table, new TableDesc(tableDefinition))

catalog(db).tables.put(table, new TableDesc(tableWithLocation))
}
}

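As a quick illustration of the invariant this createTable change establishes (not part of the diff; the newDb/newTable helpers are hypothetical stand-ins for ExternalCatalogSuite-style fixtures, and method signatures are abbreviated):

// Hypothetical sketch: a managed table created without a location should come back
// from the catalog with the default locationUri filled in.
val catalog = new InMemoryCatalog
catalog.createDatabase(newDb("db1"), ignoreIfExists = false)          // newDb: assumed helper
catalog.createTable(newTable("db1", "tbl"), ignoreIfExists = false)   // newTable: assumed helper
assert(catalog.getTable("db1", "tbl").storage.locationUri.isDefined)
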
@@ -218,8 +232,12 @@
purge: Boolean): Unit = synchronized {
requireDbExists(db)
if (tableExists(db, table)) {
if (getTable(db, table).tableType == CatalogTableType.MANAGED) {
val dir = new Path(catalog(db).db.locationUri, table)
val tableMeta = getTable(db, table)
if (tableMeta.tableType == CatalogTableType.MANAGED) {
assert(tableMeta.storage.locationUri.isDefined,
"Managed table should always have table location, as we will assign a default location " +
"to it if it doesn't have one.")
val dir = new Path(tableMeta.storage.locationUri.get)
try {
val fs = dir.getFileSystem(hadoopConfig)
fs.delete(dir, true)
@@ -244,7 +262,10 @@
oldDesc.table = oldDesc.table.copy(identifier = TableIdentifier(newName, Some(db)))

if (oldDesc.table.tableType == CatalogTableType.MANAGED) {
val oldDir = new Path(catalog(db).db.locationUri, oldName)
assert(oldDesc.table.storage.locationUri.isDefined,
"Managed table should always have table location, as we will assign a default location " +
"to it if it doesn't have one.")
val oldDir = new Path(oldDesc.table.storage.locationUri.get)
val newDir = new Path(catalog(db).db.locationUri, newName)
try {
val fs = oldDir.getFileSystem(hadoopConfig)
@@ -254,6 +275,7 @@
throw new SparkException(s"Unable to rename table $oldName to $newName as failed " +
s"to rename its directory $oldDir", e)
}
oldDesc.table = oldDesc.table.withNewStorage(locationUri = Some(newDir.toUri.toString))
Contributor (@clockfly, Sep 12, 2016):

Maybe for managed tables we should never set the locationUri field in CatalogStorageFormat.

Instead, we could add a method to CatalogTable with a signature like this (sketch):

// in class CatalogTable
def location: String = {
  if (tableType == CatalogTableType.MANAGED) {
    // databasePath stands for the parent database's location in this sketch
    new Path(databasePath, identifier.table).toString
  } else {
    storage.locationUri.getOrElse(
      throw new IllegalStateException(s"locationUri for unmanaged table $identifier is not set"))
  }
}

The advantage is that we don't need to handle the inconsistency between the table name and the locationUri for managed tables.

Member:

In Hive, users are allowed to create a Hive managed table with a user-specified location. In Spark SQL, we do not allow users to do that: if users specify a location, we always convert the table type to EXTERNAL.

When we create a managed table without a location, Hive sets it for us. Before this PR, sometimes we checked the path option and sometimes we checked locationUri. That could hide bugs we do not realize, especially for CreateDataSourceTableAsSelectCommand, where we assume our generated path is always identical to the locationUri populated by Hive. Thus, we should explicitly set locationUri for Hive managed tables using our generated path.

Based on my understanding, the point here is to ensure InMemoryCatalog and HiveExternalCatalog behave the same.

Contributor:

Can we hide all the Hive-specific details in HiveExternalCatalog? I see many places in the code where we add a locationUri for a managed table; that doesn't seem necessary if the locationUri can be deduced from the table name.

We can always do some conversion in HiveExternalCatalog to turn a CatalogTable into something Hive understands:

private def saveTableIntoHive(tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = {

Member:

For determining the default location, we can do it your way. However, we should not disallow users from changing it: ALTER TABLE ... SET LOCATION can still change the location of a managed table.

}

catalog(db).tables.put(newName, oldDesc)
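
Continuing the hypothetical sketch above, the renameTable change keeps metadata and data in sync: after a rename, the managed table's locationUri should point at the new directory rather than the old one.

// Hypothetical continuation of the sketch above (same assumed helpers).
catalog.renameTable("db1", "tbl", "tbl2")
assert(catalog.getTable("db1", "tbl2").storage.locationUri.get.endsWith("tbl2"))
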
@@ -373,7 +373,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
throw new AnalysisException(s"Table $tableIdent already exists.")

case _ =>
val tableType = if (new CaseInsensitiveMap(extraOptions.toMap).contains("path")) {
val storage = DataSource.buildStorageFormatFromOptions(extraOptions.toMap)
val tableType = if (storage.locationUri.isDefined) {
CatalogTableType.EXTERNAL
} else {
CatalogTableType.MANAGED
@@ -382,7 +383,7 @@
val tableDesc = CatalogTable(
identifier = tableIdent,
tableType = tableType,
storage = CatalogStorageFormat.empty.copy(properties = extraOptions.toMap),
storage = storage,
schema = new StructType,
provider = Some(source),
partitionColumnNames = partitioningColumns.getOrElse(Nil),
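
The DataFrameWriter change above keys the MANAGED/EXTERNAL decision off storage.locationUri instead of probing the raw "path" option. The following is a rough sketch of what DataSource.buildStorageFormatFromOptions is assumed to do, inferred from its call sites in this diff rather than copied from the PR:

// Assumed behavior: lift a case-insensitive "path" option into locationUri and keep
// the remaining options as storage properties.
def buildStorageFormatFromOptions(options: Map[String, String]): CatalogStorageFormat = {
  val path = new CaseInsensitiveMap(options).get("path")
  val optionsWithoutPath = options.filterKeys(_.toLowerCase != "path")
  CatalogStorageFormat.empty.copy(locationUri = path, properties = optionsWithoutPath)
}
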
@@ -343,7 +343,8 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {

// TODO: this may be wrong for non file-based data source like JDBC, which should be external
// even there is no `path` in options. We should consider allow the EXTERNAL keyword.
val tableType = if (new CaseInsensitiveMap(options).contains("path")) {
val storage = DataSource.buildStorageFormatFromOptions(options)
val tableType = if (storage.locationUri.isDefined) {
CatalogTableType.EXTERNAL
} else {
CatalogTableType.MANAGED
@@ -352,7 +353,7 @@
val tableDesc = CatalogTable(
identifier = table,
tableType = tableType,
storage = CatalogStorageFormat.empty.copy(properties = options),
storage = storage,
schema = schema.getOrElse(new StructType),
provider = Some(provider),
partitionColumnNames = partitionColumnNames,
@@ -1062,17 +1063,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
if (conf.convertCTAS && !hasStorageProperties) {
// At here, both rowStorage.serdeProperties and fileStorage.serdeProperties
// are empty Maps.
val optionsWithPath = if (location.isDefined) {
Map("path" -> location.get)
} else {
Map.empty[String, String]
}

val newTableDesc = tableDesc.copy(
storage = CatalogStorageFormat.empty.copy(properties = optionsWithPath),
provider = Some(conf.defaultDataSourceName)
)

storage = CatalogStorageFormat.empty.copy(locationUri = location),
provider = Some(conf.defaultDataSourceName))
CreateTable(newTableDesc, mode, Some(q))
} else {
CreateTable(tableDesc, mode, Some(q))
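
To show the intended effect of the parser change at the user level, here is an assumed usage sketch (table names and the path are made up; it is not code from the PR): a CREATE TABLE ... USING statement whose OPTIONS contain a path produces an EXTERNAL table with locationUri set, while omitting the path produces a MANAGED table that later receives the default location.

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType

// Assumes an existing SparkSession `spark`.
spark.sql("CREATE TABLE managed_t(i INT) USING parquet")
spark.sql("CREATE TABLE external_t(i INT) USING parquet OPTIONS (path '/tmp/external_t')")

val catalog = spark.sessionState.catalog
assert(catalog.getTableMetadata(TableIdentifier("managed_t")).tableType == CatalogTableType.MANAGED)
assert(catalog.getTableMetadata(TableIdentifier("external_t")).tableType == CatalogTableType.EXTERNAL)
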
@@ -57,13 +57,14 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo

// Create the relation to validate the arguments before writing the metadata to the metastore,
// and infer the table schema and partition if users didn't specify schema in CREATE TABLE.
val pathOption = table.storage.locationUri.map("path" -> _)
val dataSource: BaseRelation =
DataSource(
sparkSession = sparkSession,
userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
className = table.provider.get,
bucketSpec = table.bucketSpec,
options = table.storage.properties).resolveRelation()
options = table.storage.properties ++ pathOption).resolveRelation()

dataSource match {
case fs: HadoopFsRelation =>
@@ -85,14 +86,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
}
}

val optionsWithPath = if (table.tableType == CatalogTableType.MANAGED) {
table.storage.properties + ("path" -> sessionState.catalog.defaultTablePath(table.identifier))
} else {
table.storage.properties
}

val newTable = table.copy(
Contributor:

Where do we assign the default location?

storage = table.storage.copy(properties = optionsWithPath),
schema = dataSource.schema,
partitionColumnNames = partitionColumnNames,
// If metastore partition management for file source tables is enabled, we start off with
@@ -140,12 +134,6 @@ case class CreateDataSourceTableAsSelectCommand(
val tableIdentWithDB = table.identifier.copy(database = Some(db))
val tableName = tableIdentWithDB.unquotedString

val optionsWithPath = if (table.tableType == CatalogTableType.MANAGED) {
table.storage.properties + ("path" -> sessionState.catalog.defaultTablePath(table.identifier))
} else {
table.storage.properties
}

var createMetastoreTable = false
var existingSchema = Option.empty[StructType]
if (sparkSession.sessionState.catalog.tableExists(tableIdentWithDB)) {
@@ -162,13 +150,7 @@
return Seq.empty[Row]
case SaveMode.Append =>
// Check if the specified data source match the data source of the existing table.
val dataSource = DataSource(
sparkSession = sparkSession,
userSpecifiedSchema = Some(query.schema.asNullable),
partitionColumns = table.partitionColumnNames,
bucketSpec = table.bucketSpec,
className = provider,
options = optionsWithPath)
val existingProvider = DataSource.lookupDataSource(provider)
// TODO: Check that options from the resolved relation match the relation that we are
// inserting into (i.e. using the same compression).

Expand All @@ -178,7 +160,7 @@ case class CreateDataSourceTableAsSelectCommand(
case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation, _, _) =>
// check if the file formats match
l.relation match {
case r: HadoopFsRelation if r.fileFormat.getClass != dataSource.providingClass =>
case r: HadoopFsRelation if r.fileFormat.getClass != existingProvider =>
throw new AnalysisException(
s"The file format of the existing table $tableName is " +
s"`${r.fileFormat.getClass.getName}`. It doesn't match the specified " +
@@ -213,13 +195,20 @@ case class CreateDataSourceTableAsSelectCommand(
case None => data
}

val tableLocation = if (table.tableType == CatalogTableType.MANAGED) {
Some(sessionState.catalog.defaultTablePath(table.identifier))
} else {
table.storage.locationUri
}

// Create the relation based on the data of df.
val pathOption = tableLocation.map("path" -> _)
val dataSource = DataSource(
sparkSession,
className = provider,
partitionColumns = table.partitionColumnNames,
bucketSpec = table.bucketSpec,
options = optionsWithPath)
options = table.storage.properties ++ pathOption)

val result = try {
dataSource.write(mode, df)
Expand All @@ -230,7 +219,7 @@ case class CreateDataSourceTableAsSelectCommand(
}
if (createMetastoreTable) {
val newTable = table.copy(
storage = table.storage.copy(properties = optionsWithPath),
storage = table.storage.copy(locationUri = tableLocation),
// We will use the schema of resolved.relation as the schema of the table (instead of
// the schema of df). It is important since the nullability may be changed by the relation
// provider (for example, see org.apache.spark.sql.parquet.DefaultSource).
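
CreateDataSourceTableAsSelectCommand now derives a managed table's location from sessionState.catalog.defaultTablePath and records it in locationUri instead of a "path" property. As a rough approximation (assumed, not taken from this PR), defaultTablePath resolves the table name against its database's location:

// Sketch of the assumed behavior inside SessionCatalog; details such as table-name
// formatting are elided. Path is org.apache.hadoop.fs.Path.
def defaultTablePath(tableIdent: TableIdentifier): String = {
  val dbName = tableIdent.database.getOrElse(getCurrentDatabase)
  val dbLocation = getDatabaseMetadata(dbName).locationUri
  new Path(new Path(dbLocation), tableIdent.table).toString
}
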
@@ -485,14 +485,6 @@ case class AlterTableRecoverPartitionsCommand(
}
}

private def getBasePath(table: CatalogTable): Option[String] = {
if (table.provider == Some("hive")) {
table.storage.locationUri
} else {
new CaseInsensitiveMap(table.storage.properties).get("path")
}
}

override def run(spark: SparkSession): Seq[Row] = {
val catalog = spark.sessionState.catalog
val table = catalog.getTableMetadata(tableName)
@@ -503,13 +495,12 @@
s"Operation not allowed: $cmd only works on partitioned tables: $tableIdentWithDB")
}

val tablePath = getBasePath(table)
if (tablePath.isEmpty) {
if (table.storage.locationUri.isEmpty) {
throw new AnalysisException(s"Operation not allowed: $cmd only works on table with " +
s"location provided: $tableIdentWithDB")
}

val root = new Path(tablePath.get)
val root = new Path(table.storage.locationUri.get)
logInfo(s"Recover all the partitions in $root")
val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)

@@ -688,15 +679,7 @@ case class AlterTableSetLocationCommand(
catalog.alterPartitions(table.identifier, Seq(newPart))
case None =>
// No partition spec is specified, so we set the location for the table itself
val newTable =
if (DDLUtils.isDatasourceTable(table)) {
table.withNewStorage(
locationUri = Some(location),
properties = table.storage.properties ++ Map("path" -> location))
} else {
table.withNewStorage(locationUri = Some(location))
}
catalog.alterTable(newTable)
catalog.alterTable(table.withNewStorage(locationUri = Some(location)))
Contributor:

Should we throw an exception if it is a managed table?

Member:

It is legal to change the location of Hive managed tables. Why issue an exception here?

Contributor:

It is strange to allow users to change the location of a managed table. Maybe we should ban this usage?

Probably not in the scope of this PR.

Member:

For example, when renaming a Hive managed table, we also change the location of the table. Thus, I think we might need to keep this.

Contributor Author:

We supported it before; I'd like to not break it in this PR.

Contributor Author (@cloud-fan, Sep 16, 2016):

FYI, we have a bug here: currently we allow users to SET LOCATION for a managed data source table, but SHOW CREATE TABLE cannot generate correct SQL to recreate a managed data source table whose location has been set, because a data source table with a path is always treated as an external table. We should either forbid SET LOCATION for managed data source tables, or improve the CREATE TABLE syntax to support managed tables with a path.

Contributor:

Do we still have this issue?

}
Seq.empty[Row]
}
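
With the AlterTableSetLocationCommand change above, SET LOCATION updates only storage.locationUri; the duplicated "path" entry in storage.properties is no longer maintained for data source tables. A hypothetical check of that behavior (path and table name are made up; imports as in the earlier sketch):

// Assumes an existing data source table `t`.
spark.sql("ALTER TABLE t SET LOCATION '/tmp/t_new_location'")
val meta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
assert(meta.storage.locationUri.contains("/tmp/t_new_location"))
assert(!meta.storage.properties.contains("path"))
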
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.util.quoteIdentifier
import org.apache.spark.sql.execution.datasources.PartitioningUtils
import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils

@@ -62,25 +63,6 @@ case class CreateTableLikeCommand(
val catalog = sparkSession.sessionState.catalog
val sourceTableDesc = catalog.getTempViewOrPermanentTableMetadata(sourceTable)

// Storage format
val newStorage =
if (sourceTableDesc.tableType == CatalogTableType.VIEW) {
val newPath = catalog.defaultTablePath(targetTable)
CatalogStorageFormat.empty.copy(properties = Map("path" -> newPath))
} else if (DDLUtils.isDatasourceTable(sourceTableDesc)) {
val newPath = catalog.defaultTablePath(targetTable)
val newSerdeProp =
sourceTableDesc.storage.properties.filterKeys(_.toLowerCase != "path") ++
Map("path" -> newPath)
sourceTableDesc.storage.copy(
locationUri = None,
properties = newSerdeProp)
} else {
sourceTableDesc.storage.copy(
locationUri = None,
properties = sourceTableDesc.storage.properties)
}

val newProvider = if (sourceTableDesc.tableType == CatalogTableType.VIEW) {
Some(sparkSession.sessionState.conf.defaultDataSourceName)
} else {
@@ -91,7 +73,8 @@ case class CreateTableLikeCommand(
CatalogTable(
identifier = targetTable,
tableType = CatalogTableType.MANAGED,
storage = newStorage,
// We are creating a new managed table, which should not have custom table location.
storage = sourceTableDesc.storage.copy(locationUri = None),
Contributor:

When will we set the location? Is it set by the Hive metastore?

schema = sourceTableDesc.schema,
provider = newProvider,
partitionColumnNames = sourceTableDesc.partitionColumnNames,
@@ -170,13 +153,6 @@ case class AlterTableRenameCommand(
case NonFatal(e) => log.warn(e.toString, e)
}
}
// For datasource tables, we also need to update the "path" serde property
if (DDLUtils.isDatasourceTable(table) && table.tableType == CatalogTableType.MANAGED) {
val newPath = catalog.defaultTablePath(newName)
val newTable = table.withNewStorage(
properties = table.storage.properties ++ Map("path" -> newPath))
catalog.alterTable(newTable)
}
// Invalidate the table last, otherwise uncaching the table would load the logical plan
// back into the hive metastore cache
catalog.refreshTable(oldName)
@@ -367,8 +343,9 @@ case class TruncateTableCommand(
DDLUtils.verifyPartitionProviderIsHive(spark, table, "TRUNCATE TABLE ... PARTITION")
}
val locations =
if (DDLUtils.isDatasourceTable(table)) {
Seq(table.storage.properties.get("path"))
// TODO: The `InMemoryCatalog` doesn't support listPartition with partial partition spec.
if (spark.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") {
Seq(table.storage.locationUri)
} else if (table.partitionColumnNames.isEmpty) {
Seq(table.storage.locationUri)
} else {
@@ -916,17 +893,18 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman
}

private def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = {
val props = metadata.properties

builder ++= s"USING ${metadata.provider.get}\n"

val dataSourceOptions = metadata.storage.properties.filterNot {
case (key, value) =>
val dataSourceOptions = metadata.storage.properties.map {
case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'"
} ++ metadata.storage.locationUri.flatMap { location =>
if (metadata.tableType == MANAGED) {
// If it's a managed table, omit PATH option. Spark SQL always creates external table
// when the table creation DDL contains the PATH option.
key.toLowerCase == "path" && metadata.tableType == MANAGED
}.map {
case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'"
None
} else {
Some(s"path '${escapeSingleQuotedString(location)}'")
}
}

if (dataSourceOptions.nonEmpty) {
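
Following the ShowCreateTableCommand change above, the generated DDL takes the path from storage.locationUri and omits it for managed tables. An assumed illustration (the exact output shape is approximate), reusing the tables from the earlier sketch:

// SHOW CREATE TABLE for the external table is expected to include a clause like
//   OPTIONS ( path '/tmp/external_t' )
// while the managed table's output should contain no path option at all.
spark.sql("SHOW CREATE TABLE external_t").collect().foreach(println)
spark.sql("SHOW CREATE TABLE managed_t").collect().foreach(println)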