From dc977d26acb29731f62addaec67ea6dbe077e670 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 4 May 2018 18:13:01 -0700 Subject: [PATCH 01/14] SPARK-24252: Add v2 data source mix-in for catalog support. --- .../spark/sql/sources/v2/CatalogSupport.java | 19 +++ .../sources/v2/catalog/DataSourceCatalog.java | 106 ++++++++++++ .../spark/sql/sources/v2/catalog/Table.java | 42 +++++ .../sql/sources/v2/catalog/TableChange.java | 156 ++++++++++++++++++ 4 files changed, 323 insertions(+) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java new file mode 100644 index 000000000000..cef6d59cd6cd --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java @@ -0,0 +1,19 @@ +package org.apache.spark.sql.sources.v2; + +import org.apache.spark.sql.sources.v2.catalog.DataSourceCatalog; + +/** + * A mix-in interface for {@link DataSourceV2} catalog support. Data sources can implement this + * interface to provide the ability to load, create, alter, and drop tables. + *
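+ * For example, a data source might expose its catalog through this mix-in as follows. This is
+ * only a minimal sketch; {@code ExampleSource} and {@code ExampleCatalog} are hypothetical names.
+ *
+ *   public class ExampleSource implements DataSourceV2, CatalogSupport {
+ *     private final DataSourceCatalog catalog = new ExampleCatalog();
+ *
+ *     public DataSourceCatalog catalog() {
+ *       return catalog;
+ *     }
+ *   }
+ *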
+ * Data sources must implement this interface to support logical operations that combine writing + * data with catalog tasks, like create-table-as-select. + */ +public interface CatalogSupport { + /** + * Return a {@link DataSourceCatalog catalog} for tables. + * + * @return a catalog + */ + DataSourceCatalog catalog(); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java new file mode 100644 index 000000000000..91bf5953fde2 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java @@ -0,0 +1,106 @@ +package org.apache.spark.sql.sources.v2.catalog; + +import org.apache.spark.sql.catalyst.TableIdentifier; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException; +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.types.StructType; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +public interface DataSourceCatalog { + /** + * Load table metadata by {@link TableIdentifier identifier} from the catalog. + * + * @param ident a table identifier + * @return the table's metadata + * @throws NoSuchTableException If the table doesn't exist. + */ + Table loadTable(TableIdentifier ident) throws NoSuchTableException; + + /** + * Create a table in the catalog. + * + * @param ident a table identifier + * @param schema the schema of the new table, as a struct type + * @return metadata for the new table + * @throws TableAlreadyExistsException If a table already exists for the identifier + */ + default Table createTable(TableIdentifier ident, + StructType schema) throws TableAlreadyExistsException { + return createTable(ident, schema, Collections.emptyList(), Collections.emptyMap()); + } + + /** + * Create a table in the catalog. + * + * @param ident a table identifier + * @param schema the schema of the new table, as a struct type + * @param properties a string map of table properties + * @return metadata for the new table + * @throws TableAlreadyExistsException If a table already exists for the identifier + */ + default Table createTable(TableIdentifier ident, + StructType schema, + Map properties) throws TableAlreadyExistsException { + return createTable(ident, schema, Collections.emptyList(), properties); + } + + /** + * Create a table in the catalog. + * + * @param ident a table identifier + * @param schema the schema of the new table, as a struct type + * @param partitions a list of expressions to use for partitioning data in the table + * @param properties a string map of table properties + * @return metadata for the new table + * @throws TableAlreadyExistsException If a table already exists for the identifier + */ + Table createTable(TableIdentifier ident, + StructType schema, + List partitions, + Map properties) throws TableAlreadyExistsException; + + /** + * Apply a list of {@link TableChange changes} to a table in the catalog. + *
+ * Implementations may reject the requested changes. If any change is rejected, none of the + * changes should be applied to the table. + * + * @param ident a table identifier + * @param changes a list of changes to apply to the table + * @return updated metadata for the table + * @throws NoSuchTableException If the table doesn't exist. + * @throws IllegalArgumentException If any change is rejected by the implementation. + */ + Table alterTable(TableIdentifier ident, + List changes) throws NoSuchTableException; + + /** + * Apply {@link TableChange changes} to a table in the catalog. + *
+ * Implementations may reject the requested changes. If any change is rejected, none of the + * changes should be applied to the table. + * + * @param ident a table identifier + * @param changes a list of changes to apply to the table + * @return updated metadata for the table + * @throws NoSuchTableException If the table doesn't exist. + * @throws IllegalArgumentException If any change is rejected by the implementation. + */ + default Table alterTable(TableIdentifier ident, + TableChange... changes) throws NoSuchTableException { + return alterTable(ident, Arrays.asList(changes)); + } + + /** + * Drop a table in the catalog. + * + * @param ident a table identifier + * @return true if a table was deleted, false if no table exists for the identifier + */ + boolean dropTable(TableIdentifier ident); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java new file mode 100644 index 000000000000..36b95fb54402 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java @@ -0,0 +1,42 @@ +package org.apache.spark.sql.sources.v2.catalog; + +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.types.StructType; + +import java.util.List; +import java.util.Map; + +/** + * Represents table metadata from a {@link DataSourceCatalog}. + */ +public interface Table { + /** + * Return the table name. + * @return this table's name + */ + String name(); + + /** + * Return the database that contains the table. + * @return this table's database + */ + String database(); + + /** + * Return the table properties. + * @return this table's map of string properties + */ + Map properties(); + + /** + * Return the table schema. + * @return this table's schema as a struct type + */ + StructType schema(); + + /** + * Return the table partitioning expressions. + * @return this table's partitioning expressions + */ + List partitions(); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java new file mode 100644 index 000000000000..e191fc79c820 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java @@ -0,0 +1,156 @@ +package org.apache.spark.sql.sources.v2.catalog; + +import org.apache.spark.sql.types.DataType; + +/** + * TableChange subclasses represent requested changes to a table. These are passed to + * {@link DataSourceCatalog#alterTable}. + */ +public interface TableChange { + + /** + * Create a TableChange for adding a top-level column to a table. + *
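+ * For example, {@code addColumn("ts", TimestampType)} requests a new top-level column named
+ * "ts"; the name and type here are only an illustration.
+ *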
+ * Because "." may be interpreted as a field path separator or may be used in field names, it is + * not allowed in names passed to this method. To add to nested types or to add fields with + * names that contain ".", use {@link #addColumn(String, String, DataType)}. + * + * @param name the new top-level column name + * @param dataType the new column's data type + * @return a TableChange for the addition + */ + static TableChange addColumn(String name, DataType dataType) { + return new AddColumn(null, name, dataType); + } + + /** + * Create a TableChange for adding a nested column to a table. + *
+ * The parent name is used to find the parent struct type where the nested field will be added. + * If the parent name is null, the new column will be added to the root as a top-level column. + * If parent identifies a struct, a new column is added to that struct. If it identifies a list, + * the column is added to the list element struct, and if it identifies a map, the new column is + * added to the map's value struct. + *
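+ * For example, if the column "point" is a struct, {@code addColumn("point", "z", DoubleType)}
+ * requests a new field "z" inside that struct; the names and type here are only an illustration.
+ *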
+ * The given name is used to name the new column and names containing "." are not handled + * differently. + * + * @param parent the new field's parent + * @param name the new field name + * @param dataType the new field's data type + * @return a TableChange for the addition + */ + static TableChange addColumn(String parent, String name, DataType dataType) { + return new AddColumn(parent, name, dataType); + } + + /** + * Create a TableChange for renaming a field. + *
+ * The name is used to find the field to rename. The new name will replace the name of the type. + * For example, renameColumn("a.b.c", "x") should produce column a.b.x. + * + * @param name the current field name + * @param newName the new name + * @return a TableChange for the rename + */ + static TableChange renameColumn(String name, String newName) { + return new RenameColumn(name, newName); + } + + /** + * Create a TableChange for updating the type of a field. + *
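+ * For example, {@code updateColumn("id", LongType)} requests that the type of field "id" be
+ * changed to long; whether a particular type change is supported is left to the implementation.
+ *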
+ * The name is used to find the field to update. + * + * @param name the field name + * @param newDataType the new data type + * @return a TableChange for the update + */ + static TableChange updateColumn(String name, DataType newDataType) { + return new UpdateColumn(name, newDataType); + } + + /** + * Create a TableChange for deleting a field from a table. + * + * @param name the name of the field to delete + * @return a TableChange for the delete + */ + static TableChange deleteColumn(String name) { + return new DeleteColumn(name); + } + + final class AddColumn implements TableChange { + private final String parent; + private final String name; + private final DataType dataType; + + private AddColumn(String parent, String name, DataType dataType) { + this.parent = parent; + this.name = name; + this.dataType = dataType; + } + + public String parent() { + return parent; + } + + public String name() { + return name; + } + + public DataType type() { + return dataType; + } + } + + final class RenameColumn implements TableChange { + private final String name; + private final String newName; + + private RenameColumn(String name, String newName) { + this.name = name; + this.newName = newName; + } + + public String name() { + return name; + } + + public String newName() { + return newName; + } + } + + final class UpdateColumn implements TableChange { + private final String name; + private final DataType newDataType; + + private UpdateColumn(String name, DataType newDataType) { + this.name = name; + this.newDataType = newDataType; + } + + public String name() { + return name; + } + + public DataType newDataType() { + return newDataType; + } + } + + final class DeleteColumn implements TableChange { + private final String name; + + private DeleteColumn(String name) { + this.name = name; + } + + public String name() { + return name; + } + } + +} From 7d75da5e6abc2bf90b8cf7f530412d6b01ee6938 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 11 May 2018 14:27:47 -0700 Subject: [PATCH 02/14] SPARK-24252: Add copyright headers. --- .../spark/sql/sources/v2/CatalogSupport.java | 17 +++++++++++++++++ .../sources/v2/catalog/DataSourceCatalog.java | 17 +++++++++++++++++ .../spark/sql/sources/v2/catalog/Table.java | 17 +++++++++++++++++ .../sql/sources/v2/catalog/TableChange.java | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java index cef6d59cd6cd..df0215c65932 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.sql.sources.v2; import org.apache.spark.sql.sources.v2.catalog.DataSourceCatalog; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java index 91bf5953fde2..b896ecb0a078 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.sql.sources.v2.catalog; import org.apache.spark.sql.catalyst.TableIdentifier; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java index 36b95fb54402..2285c78fd51b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.sql.sources.v2.catalog; import org.apache.spark.sql.catalyst.expressions.Expression; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java index e191fc79c820..dbb03a19a0b3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.sql.sources.v2.catalog; import org.apache.spark.sql.types.DataType; From 3be3b7c320c7f9b68270112c9d3f7fa0a1ff377c Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 4 Jul 2018 10:02:52 -0700 Subject: [PATCH 03/14] SPARK-24252: Update for review comments. * Rename CatalogSupport to TableSupport * Rename DataSourceCatalog to TableCatalog * Remove name and database from Table --- .../{CatalogSupport.java => TableSupport.java} | 8 ++++---- .../spark/sql/sources/v2/catalog/Table.java | 16 ++-------------- ...{DataSourceCatalog.java => TableCatalog.java} | 2 +- .../sql/sources/v2/catalog/TableChange.java | 2 +- 4 files changed, 8 insertions(+), 20 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{CatalogSupport.java => TableSupport.java} (86%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/{DataSourceCatalog.java => TableCatalog.java} (99%) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java similarity index 86% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java index df0215c65932..ad42b4b26838 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/CatalogSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java @@ -17,7 +17,7 @@ package org.apache.spark.sql.sources.v2; -import org.apache.spark.sql.sources.v2.catalog.DataSourceCatalog; +import org.apache.spark.sql.sources.v2.catalog.TableCatalog; /** * A mix-in interface for {@link DataSourceV2} catalog support. Data sources can implement this @@ -26,11 +26,11 @@ * Data sources must implement this interface to support logical operations that combine writing * data with catalog tasks, like create-table-as-select. */ -public interface CatalogSupport { +public interface TableSupport { /** - * Return a {@link DataSourceCatalog catalog} for tables. + * Return a {@link TableCatalog catalog} for tables. * * @return a catalog */ - DataSourceCatalog catalog(); + TableCatalog catalog(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java index 2285c78fd51b..45b4cc1841fd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java @@ -24,21 +24,9 @@ import java.util.Map; /** - * Represents table metadata from a {@link DataSourceCatalog}. + * Represents table metadata from a {@link TableCatalog} or other table sources. */ public interface Table { - /** - * Return the table name. - * @return this table's name - */ - String name(); - - /** - * Return the database that contains the table. - * @return this table's database - */ - String database(); - /** * Return the table properties. 
* @return this table's map of string properties @@ -55,5 +43,5 @@ public interface Table { * Return the table partitioning expressions. * @return this table's partitioning expressions */ - List partitions(); + List partitionExpressions(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableCatalog.java similarity index 99% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableCatalog.java index b896ecb0a078..ab15f355968d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/DataSourceCatalog.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableCatalog.java @@ -28,7 +28,7 @@ import java.util.List; import java.util.Map; -public interface DataSourceCatalog { +public interface TableCatalog { /** * Load table metadata by {@link TableIdentifier identifier} from the catalog. * diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java index dbb03a19a0b3..ea2daf7eea54 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java @@ -21,7 +21,7 @@ /** * TableChange subclasses represent requested changes to a table. These are passed to - * {@link DataSourceCatalog#alterTable}. + * {@link TableCatalog#alterTable}. */ public interface TableChange { From 7ad0c8e8fe45f1b85011fc91dd162dba3951593e Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 4 Jul 2018 10:19:45 -0700 Subject: [PATCH 04/14] SPARK-24252: Add TableChange example to Javadocs. --- .../spark/sql/sources/v2/catalog/TableChange.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java index ea2daf7eea54..6e3c6ca0097d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java @@ -21,7 +21,16 @@ /** * TableChange subclasses represent requested changes to a table. These are passed to - * {@link TableCatalog#alterTable}. + * {@link TableCatalog#alterTable}. For example, + *
+ *   import TableChange._
+ *   val catalog = source.asInstanceOf[TableSupport].catalog()
+ *   catalog.alterTable(ident,
+ *       addColumn("x", IntegerType),
+ *       renameColumn("a", "b"),
+ *       deleteColumn("c")
+ *     )
+ * 
*/ public interface TableChange { From 3de2cfa3d069aa905c2c5076acc8d59b5ee0c63e Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 25 Jul 2018 11:11:22 -0700 Subject: [PATCH 05/14] SPARK-24252: Update catalog API in org.apache.spark.sql.catalog.v2. --- .../catalog/v2/CaseInsensitiveStringMap.java | 104 +++++++++++++++++ .../spark/sql/catalog/v2/CatalogProvider.java | 50 ++++++++ .../apache/spark/sql/catalog/v2/Catalogs.java | 108 ++++++++++++++++++ .../v2/catalog => catalog/v2}/Table.java | 4 +- .../catalog => catalog/v2}/TableCatalog.java | 20 +++- .../catalog => catalog/v2}/TableChange.java | 4 +- .../spark/sql/sources/v2/TableSupport.java | 36 ------ .../org/apache/spark/sql/SparkSession.scala | 8 ++ 8 files changed, 291 insertions(+), 43 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java rename sql/core/src/main/java/org/apache/spark/sql/{sources/v2/catalog => catalog/v2}/Table.java (93%) rename sql/core/src/main/java/org/apache/spark/sql/{sources/v2/catalog => catalog/v2}/TableCatalog.java (90%) rename sql/core/src/main/java/org/apache/spark/sql/{sources/v2/catalog => catalog/v2}/TableChange.java (98%) delete mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java new file mode 100644 index 000000000000..f5204db2e7ca --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalog.v2; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +/** + * Case-insensitive map of string keys to string values. + *
+ * This is used to pass options to v2 implementations to ensure consistent case insensitivity. + */ +public class CaseInsensitiveStringMap implements Map { + + public static CaseInsensitiveStringMap empty() { + return new CaseInsensitiveStringMap(); + } + + private final Map delegate; + + private CaseInsensitiveStringMap() { + this.delegate = new HashMap<>(); + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public boolean isEmpty() { + return delegate.isEmpty(); + } + + @Override + public boolean containsKey(Object key) { + return delegate.containsKey(key.toString().toLowerCase(Locale.ROOT)); + } + + @Override + public boolean containsValue(Object value) { + return delegate.containsValue(value); + } + + @Override + public String get(Object key) { + return delegate.get(key.toString().toLowerCase(Locale.ROOT)); + } + + @Override + public String put(String key, String value) { + return delegate.put(key.toLowerCase(Locale.ROOT), value); + } + + @Override + public String remove(Object key) { + return delegate.remove(key.toString().toLowerCase(Locale.ROOT)); + } + + @Override + public void putAll(Map m) { + for (Map.Entry entry : m.entrySet()) { + delegate.put(entry.getKey().toLowerCase(Locale.ROOT), entry.getValue()); + } + } + + @Override + public void clear() { + delegate.clear(); + } + + @Override + public Set keySet() { + return delegate.keySet(); + } + + @Override + public Collection values() { + return delegate.values(); + } + + @Override + public Set> entrySet() { + return delegate.entrySet(); + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java new file mode 100644 index 000000000000..b15e0728c6aa --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalog.v2; + +import org.apache.spark.sql.internal.SQLConf; + +/** + * A marker interface to provide a catalog implementation for Spark. + *
+ * Implementations can provide catalog functions by implementing additional interfaces, like + * {@link TableCatalog} to expose table operations. + *
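+ * For example, a catalog that supports table operations could be declared as below. This is only
+ * a sketch, and {@code ExampleCatalog} is a hypothetical class name.
+ *
+ *   public class ExampleCatalog implements TableCatalog {
+ *     public void initialize(CaseInsensitiveStringMap options) {
+ *       // configure the catalog from its options, for example a connection URL
+ *     }
+ *     // also implements loadTable, createTable, alterTable, and dropTable
+ *   }
+ *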
+ * Catalog implementations must implement this marker interface to be loaded by + * {@link Catalogs#load(String, SQLConf)}. The loader will instantiate catalog classes using the + * required no-arg constructor. After creating an instance, it will be configured by calling + * {@link #initialize(CaseInsensitiveStringMap)}. + *
+ * Catalog implementations are registered to a name by adding a configuration option to Spark: + * {@code spark.sql.catalog.catalog-name=com.example.YourCatalogClass}. All configuration properties + * in the Spark configuration that share the catalog name prefix, + * {@code spark.sql.catalog.catalog-name.(key)=(value)} will be passed in the case insensitive + * string map of options in initialization with the prefix removed. An additional property, + * {@code name}, is also added to the options and will contain the catalog's name; in this case, + * "catalog-name". + */ +public interface CatalogProvider { + /** + * Called to initialize configuration. + *
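+ * For example, with the hypothetical configuration
+ * {@code spark.sql.catalog.example=com.example.ExampleCatalog} and
+ * {@code spark.sql.catalog.example.url=http://localhost}, this method receives a map that
+ * contains {@code url=http://localhost} and {@code name=example}.
+ *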
+ * This method is called once, just after the provider is instantiated. + * + * @param options a case-insensitive string map of configuration + */ + void initialize(CaseInsensitiveStringMap options); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java new file mode 100644 index 000000000000..359e6cd5303d --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalog.v2; + +import org.apache.spark.SparkException; +import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.util.Utils; + +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static scala.collection.JavaConverters.mapAsJavaMapConverter; + +public class Catalogs { + private Catalogs() { + } + + /** + * Load and configure a catalog by name. + *
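+ * For example, assuming a catalog named "example" has been configured as described in
+ * {@link CatalogProvider}:
+ *
+ *   CatalogProvider provider = Catalogs.load("example", conf);
+ *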
+ * This loads, instantiates, and initializes the catalog provider for each call; it does not + * cache or reuse instances. + * + * @param name a String catalog name + * @param conf a SQLConf + * @return an initialized CatalogProvider + * @throws SparkException If the provider class cannot be found or instantiated + */ + public static CatalogProvider load(String name, SQLConf conf) throws SparkException { + String providerClassName = conf.getConfString("spark.sql.catalog." + name, null); + if (providerClassName == null) { + throw new SparkException(String.format( + "Catalog '%s' provider not found: spark.sql.catalog.%s is not defined", name, name)); + } + + ClassLoader loader = Utils.getContextOrSparkClassLoader(); + + try { + Class providerClass = loader.loadClass(providerClassName); + + if (!CatalogProvider.class.isAssignableFrom(providerClass)) { + throw new SparkException(String.format( + "Provider class for catalog '%s' does not implement CatalogProvider: %s", + name, providerClassName)); + } + + CatalogProvider provider = CatalogProvider.class.cast(providerClass.newInstance()); + + provider.initialize(catalogOptions(name, conf)); + + return provider; + + } catch (ClassNotFoundException e) { + throw new SparkException(String.format( + "Cannot find catalog provider class for catalog '%s': %s", name, providerClassName)); + + } catch (IllegalAccessException e) { + throw new SparkException(String.format( + "Failed to call provider constructor for catalog '%s': %s", name, providerClassName), e); + + } catch (InstantiationException e) { + throw new SparkException(String.format( + "Failed while instantiating provider for catalog '%s': %s", name, providerClassName), + e.getCause()); + } + } + + /** + * Extracts a named catalog's configuration from a SQLConf. + * + * @param name a catalog name + * @param conf a SQLConf + * @return a case insensitive string map of options starting with spark.sql.catalog.(name). + */ + private static CaseInsensitiveStringMap catalogOptions(String name, SQLConf conf) { + Map allConfs = mapAsJavaMapConverter(conf.getAllConfs()).asJava(); + Pattern prefix = Pattern.compile("^spark\\.sql\\.catalog\\." + name + "\\.(.+)"); + + CaseInsensitiveStringMap options = CaseInsensitiveStringMap.empty(); + for (Map.Entry entry : allConfs.entrySet()) { + Matcher matcher = prefix.matcher(entry.getKey()); + if (matcher.matches() && matcher.groupCount() > 0) { + options.put(matcher.group(1), entry.getValue()); + } + } + + // add name last to ensure it overwrites any conflicting options + options.put("name", name); + + return options; + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Table.java similarity index 93% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java rename to sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Table.java index 45b4cc1841fd..30a20f27b8c6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/Table.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Table.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.catalog; +package org.apache.spark.sql.catalog.v2; import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableCatalog.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java similarity index 90% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableCatalog.java rename to sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java index ab15f355968d..539beb0c39c5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableCatalog.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.catalog; +package org.apache.spark.sql.catalog.v2; import org.apache.spark.sql.catalyst.TableIdentifier; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -28,7 +28,7 @@ import java.util.List; import java.util.Map; -public interface TableCatalog { +public interface TableCatalog extends CatalogProvider { /** * Load table metadata by {@link TableIdentifier identifier} from the catalog. * @@ -38,6 +38,20 @@ public interface TableCatalog { */ Table loadTable(TableIdentifier ident) throws NoSuchTableException; + /** + * Test whether a table exists using an {@link TableIdentifier identifier} from the catalog. + * + * @param ident a table identifier + * @return true if the table exists, false otherwise + */ + default boolean tableExists(TableIdentifier ident) { + try { + return loadTable(ident) != null; + } catch (NoSuchTableException e) { + return false; + } + } + /** * Create a table in the catalog. * diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java similarity index 98% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java rename to sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java index 6e3c6ca0097d..3a8ba5e00b39 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/catalog/TableChange.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.catalog; +package org.apache.spark.sql.catalog.v2; import org.apache.spark.sql.types.DataType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java deleted file mode 100644 index ad42b4b26838..000000000000 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableSupport.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.sources.v2; - -import org.apache.spark.sql.sources.v2.catalog.TableCatalog; - -/** - * A mix-in interface for {@link DataSourceV2} catalog support. Data sources can implement this - * interface to provide the ability to load, create, alter, and drop tables. - *
- * Data sources must implement this interface to support logical operations that combine writing - * data with catalog tasks, like create-table-as-select. - */ -public interface TableSupport { - /** - * Return a {@link TableCatalog catalog} for tables. - * - * @return a catalog - */ - TableCatalog catalog(); -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 565042fcf762..4937a794c812 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -21,6 +21,7 @@ import java.io.Closeable import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal @@ -31,6 +32,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} import org.apache.spark.sql.catalog.Catalog +import org.apache.spark.sql.catalog.v2.{CatalogProvider, Catalogs} import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.encoders._ @@ -609,6 +611,12 @@ class SparkSession private( */ @transient lazy val catalog: Catalog = new CatalogImpl(self) + @transient private lazy val catalogs = new mutable.HashMap[String, CatalogProvider]() + + private[sql] def catalog(name: String): CatalogProvider = synchronized { + catalogs.getOrElseUpdate(name, Catalogs.load(name, sessionState.conf)) + } + /** * Returns the specified table/view as a `DataFrame`. * From ae61a797f584fba8ca2601b76f80774d22c3845c Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 25 Jul 2018 12:25:32 -0700 Subject: [PATCH 06/14] SPARK-24252: Add tests and clarify javadoc. --- .../catalog/v2/CaseInsensitiveStringMap.java | 3 + .../spark/sql/catalog/v2/CatalogProvider.java | 2 +- .../apache/spark/sql/catalog/v2/Catalogs.java | 3 +- .../v2/CaseInsensitiveStringMapSuite.java | 49 ++++++ .../sql/catalog/v2/CatalogLoadingSuite.java | 162 ++++++++++++++++++ .../v2/ConstructorFailureCatalogProvider.java | 35 ++++ .../sql/catalog/v2/TestCatalogProvider.java | 37 ++++ 7 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java index f5204db2e7ca..90bfa8c6c8db 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java @@ -27,6 +27,9 @@ * Case-insensitive map of string keys to string values. *
* This is used to pass options to v2 implementations to ensure consistent case insensitivity. + *
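+ * For example, a value stored with {@code put("queryTimeout", "10")} can be read back with
+ * {@code get("querytimeout")} or {@code get("QUERYTIMEOUT")}; the key name here is only an
+ * illustration.
+ *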
+ * Methods that return keys in this map, like {@link #entrySet()} and {@link #keySet()}, return + * keys converted to lower case. */ public class CaseInsensitiveStringMap implements Map { diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java index b15e0728c6aa..03831b7aa915 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java @@ -27,7 +27,7 @@ *
* Catalog implementations must implement this marker interface to be loaded by * {@link Catalogs#load(String, SQLConf)}. The loader will instantiate catalog classes using the - * required no-arg constructor. After creating an instance, it will be configured by calling + * required public no-arg constructor. After creating an instance, it will be configured by calling * {@link #initialize(CaseInsensitiveStringMap)}. *
* Catalog implementations are registered to a name by adding a configuration option to Spark: diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java index 359e6cd5303d..71ab9f528dbe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java @@ -72,7 +72,8 @@ public static CatalogProvider load(String name, SQLConf conf) throws SparkExcept } catch (IllegalAccessException e) { throw new SparkException(String.format( - "Failed to call provider constructor for catalog '%s': %s", name, providerClassName), e); + "Failed to call public no-arg constructor for catalog '%s': %s", name, providerClassName), + e); } catch (InstantiationException e) { throw new SparkException(String.format( diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java new file mode 100644 index 000000000000..e17185fc2782 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.catalog.v2; + +import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashSet; +import java.util.Set; + +public class CaseInsensitiveStringMapSuite { + @Test + public void testPutAndGet() { + CaseInsensitiveStringMap options = CaseInsensitiveStringMap.empty(); + options.put("kEy", "valUE"); + + Assert.assertEquals("Should return correct value for lower-case key", + "valUE", options.get("key")); + Assert.assertEquals("Should return correct value for upper-case key", + "valUE", options.get("KEY")); + } + + @Test + public void testKeySet() { + CaseInsensitiveStringMap options = CaseInsensitiveStringMap.empty(); + options.put("kEy", "valUE"); + + Set expectedKeySet = new HashSet<>(); + expectedKeySet.add("key"); + + Assert.assertEquals("Should return lower-case key set", expectedKeySet, options.keySet()); + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java new file mode 100644 index 000000000000..283c134b3d17 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.catalog.v2; + +import org.apache.spark.SparkException; +import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; +import org.apache.spark.sql.catalog.v2.CatalogProvider; +import org.apache.spark.sql.catalog.v2.Catalogs; +import org.apache.spark.sql.internal.SQLConf; +import org.junit.Assert; +import org.junit.Test; + +import java.util.concurrent.Callable; + +public class CatalogLoadingSuite { + @Test + public void testLoad() throws SparkException { + SQLConf conf = new SQLConf(); + conf.setConfString("spark.sql.catalog.test-name", TestCatalogProvider.class.getCanonicalName()); + + CatalogProvider provider = Catalogs.load("test-name", conf); + Assert.assertNotNull("Should instantiate a non-null provider", provider); + Assert.assertEquals("Provider should have correct implementation", + TestCatalogProvider.class, provider.getClass()); + + TestCatalogProvider testProvider = (TestCatalogProvider) provider; + Assert.assertEquals("Options should contain only one key", 1, testProvider.options.size()); + Assert.assertEquals("Options should contain correct catalog name", + "test-name", testProvider.options.get("name")); + } + + @Test + public void testInitializationOptions() throws SparkException { + SQLConf conf = new SQLConf(); + conf.setConfString("spark.sql.catalog.test-name", TestCatalogProvider.class.getCanonicalName()); + conf.setConfString("spark.sql.catalog.test-name.name", "overwritten"); + conf.setConfString("spark.sql.catalog.test-name.kEy", "valUE"); + + CatalogProvider provider = Catalogs.load("test-name", conf); + Assert.assertNotNull("Should instantiate a non-null provider", provider); + Assert.assertEquals("Provider should have correct implementation", + TestCatalogProvider.class, provider.getClass()); + + TestCatalogProvider testProvider = (TestCatalogProvider) provider; + + Assert.assertEquals("Options should contain only two keys", 2, testProvider.options.size()); + Assert.assertEquals("Options should contain correct catalog name", + "test-name", testProvider.options.get("name")); + Assert.assertEquals("Options should contain correct value for key", + "valUE", testProvider.options.get("key")); + } + + @Test + public void testLoadWithoutConfig() { + SQLConf conf = new SQLConf(); + + SparkException exc = intercept(SparkException.class, () -> Catalogs.load("missing", conf)); + + Assert.assertTrue("Should complain that implementation is not configured", + exc.getMessage().contains("provider not found: spark.sql.catalog.missing is not defined")); + Assert.assertTrue("Should identify the catalog by name", exc.getMessage().contains("missing")); + } + + @Test + public void testLoadMissingClass() { + SQLConf conf = new SQLConf(); + conf.setConfString("spark.sql.catalog.missing", "com.example.NoSuchCatalogProvider"); + + SparkException exc = 
intercept(SparkException.class, () -> Catalogs.load("missing", conf)); + + Assert.assertTrue("Should complain that the class is not found", + exc.getMessage().contains("Cannot find catalog provider class")); + Assert.assertTrue("Should identify the catalog by name", exc.getMessage().contains("missing")); + Assert.assertTrue("Should identify the missing class", + exc.getMessage().contains("com.example.NoSuchCatalogProvider")); + } + + @Test + public void testLoadNonCatalogProvider() { + SQLConf conf = new SQLConf(); + String invalidClassName = InvalidCatalogProvider.class.getCanonicalName(); + conf.setConfString("spark.sql.catalog.invalid", invalidClassName); + + SparkException exc = intercept(SparkException.class, () -> Catalogs.load("invalid", conf)); + + Assert.assertTrue("Should complain that class does not implement CatalogProvider", + exc.getMessage().contains("does not implement CatalogProvider")); + Assert.assertTrue("Should identify the catalog by name", exc.getMessage().contains("invalid")); + Assert.assertTrue("Should identify the class", exc.getMessage().contains(invalidClassName)); + } + + @Test + public void testLoadConstructorFailureCatalogProvider() { + SQLConf conf = new SQLConf(); + String invalidClassName = ConstructorFailureCatalogProvider.class.getCanonicalName(); + conf.setConfString("spark.sql.catalog.invalid", invalidClassName); + + RuntimeException exc = intercept(RuntimeException.class, () -> Catalogs.load("invalid", conf)); + + Assert.assertTrue("Should have expected error message", + exc.getMessage().contains("Expected failure")); + } + + @Test + public void testLoadAccessErrorCatalogProvider() { + SQLConf conf = new SQLConf(); + String invalidClassName = AccessErrorCatalogProvider.class.getCanonicalName(); + conf.setConfString("spark.sql.catalog.invalid", invalidClassName); + + SparkException exc = intercept(SparkException.class, () -> Catalogs.load("invalid", conf)); + + Assert.assertTrue("Should complain that no public constructor is provided", + exc.getMessage().contains("Failed to call public no-arg constructor for catalog")); + Assert.assertTrue("Should identify the catalog by name", exc.getMessage().contains("invalid")); + Assert.assertTrue("Should identify the class", exc.getMessage().contains(invalidClassName)); + } + + @SuppressWarnings("unchecked") + public static E intercept(Class expected, Callable callable) { + try { + callable.call(); + Assert.fail("No exception was thrown, expected: " + + expected.getName()); + } catch (Exception actual) { + try { + Assert.assertEquals(expected, actual.getClass()); + return (E) actual; + } catch (AssertionError e) { + e.addSuppressed(actual); + throw e; + } + } + // Compiler doesn't catch that Assert.fail will always throw an exception. 
+ throw new UnsupportedOperationException("[BUG] Should not reach this statement"); + } +} + +class AccessErrorCatalogProvider implements CatalogProvider { // no public constructor + @Override + public void initialize(CaseInsensitiveStringMap options) { + } +} + +class InvalidCatalogProvider { // doesn't implement CatalogProvider + public void initialize(CaseInsensitiveStringMap options) { + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java new file mode 100644 index 000000000000..2b550bac2865 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.catalog.v2; + +import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; +import org.apache.spark.sql.catalog.v2.CatalogProvider; + +/** + * A catalog provider that fails in its constructor. This is in its own file because it needs to be + * public for Catalogs.load to be able to call the constructor. + */ +public class ConstructorFailureCatalogProvider implements CatalogProvider { + public ConstructorFailureCatalogProvider() { + throw new RuntimeException("Expected failure."); + } + + @Override + public void initialize(CaseInsensitiveStringMap options) { + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java new file mode 100644 index 000000000000..02fe0ead5236 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql.catalog.v2; + +import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; +import org.apache.spark.sql.catalog.v2.CatalogProvider; + +/** + * A TestCatalogProvider. This is in its own file because it needs to be public for Catalogs.load + * to be able to call the constructor. + */ +public class TestCatalogProvider implements CatalogProvider { + CaseInsensitiveStringMap options = null; + + public TestCatalogProvider() { + } + + @Override + public void initialize(CaseInsensitiveStringMap options) { + this.options = options; + } +} From 8b135607512fda53ff5adc34ac8b965b04f3a3d9 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 25 Jul 2018 13:00:23 -0700 Subject: [PATCH 07/14] SPARK-24252: Fix Map.Entry reference causing doc failures. --- .../apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java index 90bfa8c6c8db..a4ad1f6994f9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java @@ -101,7 +101,7 @@ public Collection values() { } @Override - public Set> entrySet() { + public Set> entrySet() { return delegate.entrySet(); } } From b4c505758a5137a311ac098965ace19af33bd198 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 25 Jul 2018 22:07:52 -0700 Subject: [PATCH 08/14] SPARK-24252: Move catalog.v2 classes into catalyst. This is needed for catalyst plans to use catalogs. --- .../catalog/v2/CaseInsensitiveStringMap.java | 0 .../spark/sql/catalog/v2/CatalogProvider.java | 0 .../apache/spark/sql/catalog/v2/Catalogs.java | 0 .../apache/spark/sql/catalog/v2/Table.java | 0 .../spark/sql/catalog/v2/TableCatalog.java | 0 .../spark/sql/catalog/v2/TableChange.java | 0 .../v2/CaseInsensitiveStringMapSuite.java | 3 +- .../sql/catalog/v2/CatalogLoadingSuite.java | 30 +++++++++++++-- .../v2/ConstructorFailureCatalogProvider.java | 35 ------------------ .../sql/catalog/v2/TestCatalogProvider.java | 37 ------------------- 10 files changed, 27 insertions(+), 78 deletions(-) rename sql/{core => catalyst}/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java (100%) rename sql/{core => catalyst}/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java (100%) rename sql/{core => catalyst}/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java (100%) rename sql/{core => catalyst}/src/main/java/org/apache/spark/sql/catalog/v2/Table.java (100%) rename sql/{core => catalyst}/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java (100%) rename sql/{core => catalyst}/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java (100%) rename sql/{core/src/test/java/test => catalyst/src/test/java}/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java (93%) rename sql/{core/src/test/java/test => catalyst/src/test/java}/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java (91%) delete mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java delete mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java similarity index 100% rename from sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMap.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java similarity index 100% rename from sql/core/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/CatalogProvider.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java similarity index 100% rename from sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/Catalogs.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Table.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/Table.java similarity index 100% rename from sql/core/src/main/java/org/apache/spark/sql/catalog/v2/Table.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/Table.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java similarity index 100% rename from sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/TableCatalog.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java similarity index 100% rename from sql/core/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/catalog/v2/TableChange.java diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java similarity index 93% rename from sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java rename to sql/catalyst/src/test/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java index e17185fc2782..0d869108fa7d 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalog/v2/CaseInsensitiveStringMapSuite.java @@ -15,9 +15,8 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.catalog.v2; +package org.apache.spark.sql.catalog.v2; -import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; import org.junit.Assert; import org.junit.Test; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java similarity index 91% rename from sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java rename to sql/catalyst/src/test/java/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java index 283c134b3d17..62e26af7f0c6 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalog/v2/CatalogLoadingSuite.java @@ -15,12 +15,9 @@ * limitations under the License. */ -package test.org.apache.spark.sql.catalog.v2; +package org.apache.spark.sql.catalog.v2; import org.apache.spark.SparkException; -import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; -import org.apache.spark.sql.catalog.v2.CatalogProvider; -import org.apache.spark.sql.catalog.v2.Catalogs; import org.apache.spark.sql.internal.SQLConf; import org.junit.Assert; import org.junit.Test; @@ -150,7 +147,32 @@ public static E intercept(Class expected, Callable c } } +class TestCatalogProvider implements CatalogProvider { + CaseInsensitiveStringMap options = null; + + TestCatalogProvider() { + } + + @Override + public void initialize(CaseInsensitiveStringMap options) { + this.options = options; + } +} + +class ConstructorFailureCatalogProvider implements CatalogProvider { // fails in its constructor + ConstructorFailureCatalogProvider() { + throw new RuntimeException("Expected failure."); + } + + @Override + public void initialize(CaseInsensitiveStringMap options) { + } +} + class AccessErrorCatalogProvider implements CatalogProvider { // no public constructor + private AccessErrorCatalogProvider() { + } + @Override public void initialize(CaseInsensitiveStringMap options) { } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java deleted file mode 100644 index 2b550bac2865..000000000000 --- a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/ConstructorFailureCatalogProvider.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package test.org.apache.spark.sql.catalog.v2; - -import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; -import org.apache.spark.sql.catalog.v2.CatalogProvider; - -/** - * A catalog provider that fails in its constructor. 
This is in its own file because it needs to be - * public for Catalogs.load to be able to call the constructor. - */ -public class ConstructorFailureCatalogProvider implements CatalogProvider { - public ConstructorFailureCatalogProvider() { - throw new RuntimeException("Expected failure."); - } - - @Override - public void initialize(CaseInsensitiveStringMap options) { - } -} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java b/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java deleted file mode 100644 index 02fe0ead5236..000000000000 --- a/sql/core/src/test/java/test/org/apache/spark/sql/catalog/v2/TestCatalogProvider.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package test.org.apache.spark.sql.catalog.v2; - -import org.apache.spark.sql.catalog.v2.CaseInsensitiveStringMap; -import org.apache.spark.sql.catalog.v2.CatalogProvider; - -/** - * A TestCatalogProvider. This is in its own file because it needs to be public for Catalogs.load - * to be able to call the constructor. - */ -public class TestCatalogProvider implements CatalogProvider { - CaseInsensitiveStringMap options = null; - - public TestCatalogProvider() { - } - - @Override - public void initialize(CaseInsensitiveStringMap options) { - this.options = options; - } -} From 85a49d729e0ae75cf6e1725216829c7d5e79a8a1 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Mon, 7 May 2018 08:54:37 -0700 Subject: [PATCH 09/14] SPARK-24251: Add AppendData logical plan. This adds a new logical plan, AppendData, that was proposed in SPARK-23521. This also adds an analyzer rule to validate data written with AppendData against the target table. DataFrameWriter is also updated so that v2 writes use the new AppendData logical plan. 
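
A rough sketch of how the new plan is expected to be wired up; `source`,
`options`, and `query` below are placeholders for this sketch, not identifiers
added by the patch:

    import org.apache.spark.sql.catalyst.plans.logical.AppendData
    import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation

    // `source` is a DataSourceV2 that implements WriteSupport, `options` is its
    // string option map, and `query` is the LogicalPlan producing the rows.
    val relation = DataSourceV2Relation.create(source, options)

    // Append by name: the new ResolveOutputRelation analyzer rule reorders the
    // query's columns to match relation.output, inserts safe up-casts, and fails
    // analysis when a column is missing, has an incompatible type, or would write
    // nullable values to a non-null column.
    val append = AppendData.byName(relation, query)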
--- .../sql/catalyst/analysis/Analyzer.scala | 95 +++++++++++++++++++ .../sql/catalyst/analysis/NamedRelation.scala | 24 +++++ .../plans/logical/basicLogicalOperators.scala | 35 ++++++- .../org/apache/spark/sql/types/DataType.scala | 62 ++++++++++++ .../spark/sql/sources/v2/WriteSupport.java | 9 +- .../apache/spark/sql/DataFrameWriter.scala | 40 ++++---- .../datasources/v2/DataSourceV2Relation.scala | 49 ++++++++-- .../datasources/v2/DataSourceV2Strategy.scala | 5 +- ...V2.scala => WriteToDataSourceV2Exec.scala} | 4 +- .../sql/sources/v2/DataSourceV2Suite.scala | 16 ++-- 10 files changed, 297 insertions(+), 42 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/{WriteToDataSourceV2.scala => WriteToDataSourceV2Exec.scala} (96%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 4f474f4987dc..63f8c10b490f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -175,6 +175,7 @@ class Analyzer( ResolveWindowOrder :: ResolveWindowFrame :: ResolveNaturalAndUsingJoin :: + ResolveOutputRelation :: ExtractWindowExpressions :: GlobalAggregates :: ResolveAggregateFunctions :: @@ -2226,6 +2227,100 @@ class Analyzer( } } + /** + * Resolves columns of an output table from the data in a logical plan. This rule will: + * + * - Reorder columns when the write is by name + * - Insert safe casts when data types do not match + * - Insert aliases when column names do not match + * - Detect plans that are not compatible with the output table and throw AnalysisException + */ + object ResolveOutputRelation extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case append @ AppendData(table, query, isByName) + if table.resolved && query.resolved && !append.resolved => + val projection = resolveOutputColumns(table.name, table.output, query, isByName) + + if (projection != query) { + append.copy(query = projection) + } else { + append + } + } + + def resolveOutputColumns( + tableName: String, + expected: Seq[Attribute], + query: LogicalPlan, + byName: Boolean): LogicalPlan = { + + if (expected.size < query.output.size) { + throw new AnalysisException( + s"""Cannot write to '$tableName', too many data columns: + |Table columns: ${expected.map(_.name).mkString(", ")} + |Data columns: ${query.output.map(_.name).mkString(", ")}""".stripMargin) + } + + val errors = new mutable.ArrayBuffer[String]() + val resolved: Seq[NamedExpression] = if (byName) { + expected.flatMap { outAttr => + query.resolveQuoted(outAttr.name, resolver) match { + case Some(inAttr) if inAttr.nullable && !outAttr.nullable => + errors += s"Cannot write nullable values to non-null column '${outAttr.name}'" + None + + case Some(inAttr) if !DataType.canWrite(outAttr.dataType, inAttr.dataType, resolver) => + Some(upcast(inAttr, outAttr)) + + case Some(inAttr) => + Some(inAttr) // matches nullability, datatype, and name + + case _ => + errors += s"Cannot find data for output column '${outAttr.name}'" + None + } + } + + } else { + if (expected.size > query.output.size) { + throw new AnalysisException( + s"""Cannot write to '$tableName', not enough data columns: + |Table columns: 
${expected.map(_.name).mkString(", ")} + |Data columns: ${query.output.map(_.name).mkString(", ")}""".stripMargin) + } + + query.output.zip(expected).flatMap { + case (inAttr, outAttr) if inAttr.nullable && !outAttr.nullable => + errors += s"Cannot write nullable values to non-null column '${outAttr.name}'" + None + + case (inAttr, outAttr) + if !DataType.canWrite(inAttr.dataType, outAttr.dataType, resolver) || + inAttr.name != outAttr.name => + Some(upcast(inAttr, outAttr)) + + case (inAttr, _) => + Some(inAttr) // matches nullability, datatype, and name + } + } + + if (errors.nonEmpty) { + throw new AnalysisException( + s"Cannot write incompatible data to table '$tableName':\n- ${errors.mkString("\n- ")}") + } + + Project(resolved, query) + } + + private def upcast(inAttr: NamedExpression, outAttr: Attribute): NamedExpression = { + Alias( + UpCast(inAttr, outAttr.dataType, Seq()), outAttr.name + )( + explicitMetadata = Option(outAttr.metadata) + ) + } + } + private def commonNaturalJoinProcessing( left: LogicalPlan, right: LogicalPlan, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala new file mode 100644 index 000000000000..ad201f947b67 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +trait NamedRelation extends LogicalPlan { + def name: String +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index ea5a9b8ed554..679886c89d12 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation +import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation} import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, - RangePartitioning, RoundRobinPartitioning} +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.util.random.RandomSampler @@ -344,6 +343,36 @@ case class Join( } } +/** + * Append data to an existing table. + */ +case class AppendData( + table: NamedRelation, + query: LogicalPlan, + isByName: Boolean) extends LogicalPlan { + override def children: Seq[LogicalPlan] = Seq(query) + override def output: Seq[Attribute] = Seq.empty + + override lazy val resolved: Boolean = { + query.output.size == table.output.size && query.output.zip(table.output).forall { + case (inAttr, outAttr) => + inAttr.name == outAttr.name && // names must match + outAttr.dataType.sameType(inAttr.dataType) && // types must match + (outAttr.nullable || !inAttr.nullable) // must accept null or never produce nulls + } + } +} + +object AppendData { + def byName(table: NamedRelation, df: LogicalPlan): AppendData = { + new AppendData(table, df, true) + } + + def byPosition(table: NamedRelation, query: LogicalPlan): AppendData = { + new AppendData(table, query, false) + } +} + /** * Insert some data into a table. Note that this plan is unresolved and has to be replaced by the * concrete implementations during analysis. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index fd40741cfb5f..a63c1717d169 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -27,6 +27,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.annotation.InterfaceStability +import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.internal.SQLConf @@ -336,4 +337,65 @@ object DataType { case (fromDataType, toDataType) => fromDataType == toDataType } } + + /** + * Returns true if the write data type can be read using the read data type. 
+ * + * The write type is compatible with the read type if: + * - Both types are arrays, the array element types are compatible, and element nullability is + * compatible (read allows nulls or write does not contain nulls). + * - Both types are maps and the map key and value types are compatible, and value nullability + * is compatible (read allows nulls or write does not contain nulls). + * - Both types are structs and each field in the read struct is present in the write struct and + * compatible (including nullability), or is nullable if the write struct does not contain the + * field. Write-side structs are not compatible if they contain fields that are not present in + * the read-side struct. + * - Both types are atomic and are the same type + * + * This method does not allow type promotion such as, write type int with read type long. + * + * Extra fields in write-side structs are not allowed to avoid accidentally writing data that + * the read schema will not read, and to ensure map key equality is not changed when data is read. + * + * @param write a write-side data type to validate against the read type + * @param read a read-side data type + * @return true if data written with the write type can be read using the read type + */ + def canWrite( + write: DataType, + read: DataType, + resolver: Resolver): Boolean = { + (write, read) match { + case (wArr: ArrayType, rArr: ArrayType) => + canWrite(wArr.elementType, rArr.elementType, resolver) && + (rArr.containsNull || !wArr.containsNull) + + case (wMap: MapType, rMap: MapType) => + // map keys cannot include data fields not in the read schema without changing equality when + // read. map keys can be missing fields as long as they are nullable in the read schema. + canWrite(wMap.keyType, rMap.keyType, resolver) && + canWrite(wMap.valueType, rMap.valueType, resolver) && + (rMap.valueContainsNull || !wMap.valueContainsNull) + + case (StructType(writeFields), StructType(readFields)) => + lazy val hasNoExtraFields = + (writeFields.map(_.name).toSet -- readFields.map(_.name)).isEmpty + + readFields.forall { readField => + writeFields.find(writeField => resolver(writeField.name, readField.name)) match { + case Some(writeField) => + canWrite(writeField.dataType, readField.dataType, resolver) && + (readField.nullable || !writeField.nullable) + + case None => + // the write schema doesn't have the field, so it must be nullable in the read schema + readField.nullable + } + } && hasNoExtraFields + + case (w, r) => + // the write and read types are atomic and must be the same + w.sameType(r) + } + } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java index 83aeec0c4785..048787a7a0a0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java @@ -38,15 +38,16 @@ public interface WriteSupport extends DataSourceV2 { * If this method fails (by throwing an exception), the action will fail and no Spark job will be * submitted. * - * @param jobId A unique string for the writing job. It's possible that there are many writing - * jobs running at the same time, and the returned {@link DataSourceWriter} can - * use this job id to distinguish itself from other jobs. + * @param writeUUID A unique string for the writing job. 
It's possible that there are many writing + * jobs running at the same time, and the returned {@link DataSourceWriter} can + * use this job id to distinguish itself from other jobs. * @param schema the schema of the data to be written. * @param mode the save mode which determines what to do when the data are already in this data * source, please refer to {@link SaveMode} for more details. * @param options the options for the returned data source writer, which is an immutable * case-insensitive string-to-string map. + * @return a writer to append data to this data source */ Optional createWriter( - String jobId, StructType schema, SaveMode mode, DataSourceOptions options); + String writeUUID, StructType schema, SaveMode mode, DataSourceOptions options); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 39c0e102b69b..c1663765f6a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql -import java.text.SimpleDateFormat -import java.util.{Date, Locale, Properties, UUID} +import java.util.{Locale, Properties, UUID} import scala.collection.JavaConverters._ @@ -26,12 +25,11 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation} import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, InsertIntoTable, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, AppendData, InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils -import org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2 +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2Utils, WriteToDataSourceV2} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.types.StructType @@ -240,21 +238,27 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { - val ds = cls.newInstance() - ds match { + val source = cls.newInstance().asInstanceOf[DataSourceV2] + source match { case ws: WriteSupport => - val options = new DataSourceOptions((extraOptions ++ - DataSourceV2Utils.extractSessionConfigs( - ds = ds.asInstanceOf[DataSourceV2], - conf = df.sparkSession.sessionState.conf)).asJava) - // Using a timestamp and a random UUID to distinguish different writing jobs. This is good - // enough as there won't be tons of writing jobs created at the same second. 
- val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) - .format(new Date()) + "-" + UUID.randomUUID() - val writer = ws.createWriter(jobId, df.logicalPlan.schema, mode, options) - if (writer.isPresent) { + val options = extraOptions ++ + DataSourceV2Utils.extractSessionConfigs(source, df.sparkSession.sessionState.conf) + + val relation = DataSourceV2Relation.create(source, options.toMap) + if (mode == SaveMode.Append) { runCommand(df.sparkSession, "save") { - WriteToDataSourceV2(writer.get(), df.planWithBarrier) + AppendData.byName(relation, df.planWithBarrier) + } + + } else { + val writer = ws.createWriter( + UUID.randomUUID.toString, df.logicalPlan.output.toStructType, mode, + new DataSourceOptions(options.asJava)) + + if (writer.isPresent) { + runCommand(df.sparkSession, "save") { + WriteToDataSourceV2(writer.get, df.planWithBarrier) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 7613eb210c65..b3b4038b9a6d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -17,15 +17,19 @@ package org.apache.spark.sql.execution.datasources.v2 +import java.util.UUID + import scala.collection.JavaConverters._ -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation +import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema, WriteSupport} import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsReportStatistics} +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter import org.apache.spark.sql.types.StructType /** @@ -40,17 +44,24 @@ case class DataSourceV2Relation( source: DataSourceV2, output: Seq[AttributeReference], options: Map[String, String], - userSpecifiedSchema: Option[StructType]) - extends LeafNode with MultiInstanceRelation with DataSourceV2StringFormat { + tableIdent: Option[TableIdentifier] = None, + userSpecifiedSchema: Option[StructType] = None) + extends LeafNode with MultiInstanceRelation with NamedRelation with DataSourceV2StringFormat { import DataSourceV2Relation._ + override def name: String = { + tableIdent.map(_.unquotedString).getOrElse(s"${source.name}:unknown") + } + override def pushedFilters: Seq[Expression] = Seq.empty override def simpleString: String = "RelationV2 " + metadataString def newReader(): DataSourceReader = source.createReader(options, userSpecifiedSchema) + def newWriter(): DataSourceWriter = source.createWriter(options, schema) + override def computeStats(): Statistics = newReader match { case r: SupportsReportStatistics => Statistics(sizeInBytes = r.getStatistics.sizeInBytes().orElse(conf.defaultSizeInBytes)) @@ -131,6 +142,15 @@ object DataSourceV2Relation { } 
} + def asWriteSupport: WriteSupport = { + source match { + case support: WriteSupport => + support + case _ => + throw new AnalysisException(s"Data source is not writable: $name") + } + } + def name: String = { source match { case registered: DataSourceRegister => @@ -151,14 +171,29 @@ object DataSourceV2Relation { asReadSupport.createReader(v2Options) } } + + def createWriter( + options: Map[String, String], + schema: StructType): DataSourceWriter = { + val v2Options = new DataSourceOptions(options.asJava) + asWriteSupport.createWriter(UUID.randomUUID.toString, schema, SaveMode.Append, v2Options).get + } } def create( source: DataSourceV2, options: Map[String, String], - userSpecifiedSchema: Option[StructType]): DataSourceV2Relation = { + tableIdent: Option[TableIdentifier] = None, + userSpecifiedSchema: Option[StructType] = None): DataSourceV2Relation = { val reader = source.createReader(options, userSpecifiedSchema) + val ident = tableIdent.orElse(tableFromOptions(options)) DataSourceV2Relation( - source, reader.readSchema().toAttributes, options, userSpecifiedSchema) + source, reader.readSchema().toAttributes, options, ident, userSpecifiedSchema) + } + + private def tableFromOptions(options: Map[String, String]): Option[TableIdentifier] = { + options + .get(DataSourceOptions.TABLE_KEY) + .map(TableIdentifier(_, options.get(DataSourceOptions.DATABASE_KEY))) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 9414e68155b9..6daaa4c65c33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import org.apache.spark.sql.{sources, Strategy} import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition} import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} @@ -136,6 +136,9 @@ object DataSourceV2Strategy extends Strategy { case WriteToDataSourceV2(writer, query) => WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil + case AppendData(r: DataSourceV2Relation, query, _) => + WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil + case WriteToContinuousDataSource(writer, query) => WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala similarity index 96% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index b1148c0f62f7..9ac745425a6a 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -35,8 +35,10 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils /** - * The logical plan for writing data into data source v2. + * Deprecated logical plan for writing data into data source v2. This is being replaced by more + * specific logical plans, like [[org.apache.spark.sql.catalyst.plans.logical.AppendData]]. */ +@deprecated("Use specific logical plans like AppendData instead", "2.4.0") case class WriteToDataSourceV2(writer: DataSourceWriter, query: LogicalPlan) extends LogicalPlan { override def children: Seq[LogicalPlan] = Seq(query) override def output: Seq[Attribute] = Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index d73eebbc84b7..09757b0e8a6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -204,33 +204,33 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { val path = file.getCanonicalPath assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) - spark.range(10).select('id, -'id).write.format(cls.getName) + spark.range(10).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), spark.range(10).select('id, -'id)) // test with different save modes - spark.range(10).select('id, -'id).write.format(cls.getName) + spark.range(10).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).mode("append").save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), spark.range(10).union(spark.range(10)).select('id, -'id)) - spark.range(5).select('id, -'id).write.format(cls.getName) + spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).mode("overwrite").save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), spark.range(5).select('id, -'id)) - spark.range(5).select('id, -'id).write.format(cls.getName) + spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).mode("ignore").save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), spark.range(5).select('id, -'id)) val e = intercept[Exception] { - spark.range(5).select('id, -'id).write.format(cls.getName) + spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).mode("error").save() } assert(e.getMessage.contains("data already exists")) @@ -247,7 +247,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } // this input data will fail to read middle way. 
- val input = spark.range(10).select(failingUdf('id).as('i)).select('i, -'i) + val input = spark.range(10).select(failingUdf('id).as('i)).select('i, -'i as 'j) val e2 = intercept[SparkException] { input.write.format(cls.getName).option("path", path).mode("overwrite").save() } @@ -256,7 +256,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) // test internal row writer - spark.range(5).select('id, -'id).write.format(cls.getName) + spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).option("internal", "true").mode("overwrite").save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), @@ -272,7 +272,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) val numPartition = 6 - spark.range(0, 10, 1, numPartition).select('id, -'id).write.format(cls.getName) + spark.range(0, 10, 1, numPartition).select('id as 'i, -'id as 'j).write.format(cls.getName) .option("path", path).save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), From 88c12a47cbc3eea04ed9d90e91c62285de594b7e Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 25 Jul 2018 11:11:45 -0700 Subject: [PATCH 10/14] Add CTAS and RTAS support. This uses the catalog API introduced in SPARK-24252 to implement CTAS and RTAS plans. --- .../sql/catalyst/analysis/NamedRelation.scala | 3 + .../plans/logical/basicLogicalOperators.scala | 33 ++++ .../spark/sql/sources/v2/ReadSupport.java | 2 +- .../sql/sources/v2/ReadSupportWithSchema.java | 2 +- .../spark/sql/sources/v2/WriteSupport.java | 2 +- .../apache/spark/sql/DataFrameReader.scala | 17 +- .../apache/spark/sql/DataFrameWriter.scala | 47 ++++- .../datasources/v2/DataSourceV2Relation.scala | 134 ++++++------- .../datasources/v2/DataSourceV2ScanExec.scala | 5 +- .../datasources/v2/DataSourceV2Strategy.scala | 37 +++- .../v2/DataSourceV2StringFormat.scala | 11 +- .../v2/WriteToDataSourceV2Exec.scala | 91 ++++++++- .../sources/v2/DataSourceV2Implicits.scala | 183 ++++++++++++++++++ 13 files changed, 454 insertions(+), 113 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/sources/v2/DataSourceV2Implicits.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala index ad201f947b67..2e72d13cb923 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/NamedRelation.scala @@ -17,8 +17,11 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan trait NamedRelation extends LogicalPlan { def name: String + + def output: Seq[AttributeReference] } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 679886c89d12..8f1c36dcef37 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -17,6 +17,8 @@ 
package org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalog.v2.TableCatalog +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation} import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions._ @@ -373,6 +375,37 @@ object AppendData { } } +/** + * Create a new table from a select query. + */ +case class CreateTableAsSelect( + catalog: TableCatalog, + table: TableIdentifier, + partitioning: Seq[Expression], + query: LogicalPlan, + writeOptions: Map[String, String], + ignoreIfExists: Boolean) extends LogicalPlan { + + override def children: Seq[LogicalPlan] = Seq(query) + override def output: Seq[Attribute] = Seq.empty + override lazy val resolved = true +} + +/** + * Replace a table with the results of a select query. + */ +case class ReplaceTableAsSelect( + catalog: TableCatalog, + table: TableIdentifier, + partitioning: Seq[Expression], + query: LogicalPlan, + writeOptions: Map[String, String]) extends LogicalPlan { + + override def children: Seq[LogicalPlan] = Seq(query) + override def output: Seq[Attribute] = Seq.empty + override lazy val resolved = true +} + /** * Insert some data into a table. Note that this plan is unresolved and has to be replaced by the * concrete implementations during analysis. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java index b2526ded53d9..d6ab6ceddd8b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java @@ -25,7 +25,7 @@ * provide data reading ability and scan the data from the data source. */ @InterfaceStability.Evolving -public interface ReadSupport extends DataSourceV2 { +public interface ReadSupport { /** * Creates a {@link DataSourceReader} to scan the data from this data source. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java index f31659904cc5..d2901064a38f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java @@ -30,7 +30,7 @@ * supports both schema inference and user-specified schema. */ @InterfaceStability.Evolving -public interface ReadSupportWithSchema extends DataSourceV2 { +public interface ReadSupportWithSchema { /** * Create a {@link DataSourceReader} to scan the data from this data source. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java index 048787a7a0a0..16484d4b8439 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java @@ -29,7 +29,7 @@ * provide data writing ability and save the data to the data source. */ @InterfaceStability.Evolving -public interface WriteSupport extends DataSourceV2 { +public interface WriteSupport { /** * Creates an optional {@link DataSourceWriter} to save the data to this data source. 
Data diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index ec9352a7fa05..4fe2ebbfc240 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.datasources.jdbc._ import org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils -import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, DataSourceV2Implicits, ReadSupport, ReadSupportWithSchema} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.unsafe.types.UTF8String @@ -191,6 +191,21 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { "read files of Hive data source directly.") } + import DataSourceV2Implicits._ + + extraOptions.get("catalog") match { + case Some(catalogName) if extraOptions.get(DataSourceOptions.TABLE_KEY).isDefined => + val catalog = sparkSession.catalog(catalogName).asTableCatalog + val options = extraOptions.toMap + val identifier = options.table.get + + return Dataset.ofRows(sparkSession, + DataSourceV2Relation.create( + catalogName, identifier, catalog.loadTable(identifier), options)) + + case _ => + } + val cls = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { val ds = cls.newInstance().asInstanceOf[DataSourceV2] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index c1663765f6a5..b1e102fd90f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -25,7 +25,7 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation} import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, AppendData, InsertIntoTable, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, AppendData, CreateTableAsSelect, InsertIntoTable, LogicalPlan, ReplaceTableAsSelect} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} @@ -236,6 +236,51 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { assertNotBucketed("save") + import DataSourceV2Implicits._ + + extraOptions.get("catalog") match { + case Some(catalogName) if extraOptions.get(DataSourceOptions.TABLE_KEY).isDefined => + val catalog = df.sparkSession.catalog(catalogName).asTableCatalog + val options = extraOptions.toMap + val identifier = options.table.get + val exists = catalog.tableExists(identifier) + + (exists, mode) match { + case (true, SaveMode.ErrorIfExists) => + throw new AnalysisException(s"Table already exists: ${identifier.quotedString}") + + case (true, SaveMode.Overwrite) => + runCommand(df.sparkSession, "save") { + 
ReplaceTableAsSelect(catalog, identifier, Seq.empty, df.logicalPlan, options) + } + + case (true, SaveMode.Append) => + val relation = DataSourceV2Relation.create( + catalogName, identifier, catalog.loadTable(identifier), options) + + runCommand(df.sparkSession, "save") { + AppendData.byName(relation, df.logicalPlan) + } + + case (false, SaveMode.Append) => + throw new AnalysisException(s"Table does not exist: ${identifier.quotedString}") + + case (false, SaveMode.ErrorIfExists) | + (false, SaveMode.Ignore) | + (false, SaveMode.Overwrite) => + + runCommand(df.sparkSession, "save") { + CreateTableAsSelect(catalog, identifier, Seq.empty, df.logicalPlan, options, + ignoreIfExists = mode == SaveMode.Ignore) + } + + case _ => + return // table exists and mode is ignore + } + + case _ => + } + val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { val source = cls.newInstance().asInstanceOf[DataSourceV2] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index b3b4038b9a6d..126c9bb1d670 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -17,20 +17,17 @@ package org.apache.spark.sql.execution.datasources.v2 -import java.util.UUID - -import scala.collection.JavaConverters._ - -import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.catalog.v2.Table import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} -import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema, WriteSupport} +import org.apache.spark.sql.sources.v2.DataSourceV2 +import org.apache.spark.sql.sources.v2.DataSourceV2Implicits._ import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsReportStatistics} import org.apache.spark.sql.sources.v2.writer.DataSourceWriter import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils /** * A logical plan representing a data source v2 scan. 
@@ -48,10 +45,10 @@ case class DataSourceV2Relation( userSpecifiedSchema: Option[StructType] = None) extends LeafNode with MultiInstanceRelation with NamedRelation with DataSourceV2StringFormat { - import DataSourceV2Relation._ + override def sourceName: String = source.name override def name: String = { - tableIdent.map(_.unquotedString).getOrElse(s"${source.name}:unknown") + tableIdent.map(_.unquotedString).getOrElse(s"$sourceName:unknown") } override def pushedFilters: Seq[Expression] = Seq.empty @@ -62,7 +59,7 @@ case class DataSourceV2Relation( def newWriter(): DataSourceWriter = source.createWriter(options, schema) - override def computeStats(): Statistics = newReader match { + override def computeStats(): Statistics = newReader() match { case r: SupportsReportStatistics => Statistics(sizeInBytes = r.getStatistics.sizeInBytes().orElse(conf.defaultSizeInBytes)) case _ => @@ -74,6 +71,43 @@ case class DataSourceV2Relation( } } +/** + * A logical plan representing a data source v2 table. + * + * @param ident The table's TableIdentifier. + * @param table The table. + * @param output The output attributes of the table. + * @param options The options for this scan or write. + */ +case class TableV2Relation( + catalogName: String, + ident: TableIdentifier, + table: Table, + output: Seq[AttributeReference], + options: Map[String, String]) + extends LeafNode with MultiInstanceRelation with NamedRelation { + + import org.apache.spark.sql.sources.v2.DataSourceV2Implicits._ + + override def name: String = ident.unquotedString + + override def simpleString: String = + s"RelationV2 $name ${Utils.truncatedString(output, "[", ", ", "]")}" + + def newReader(): DataSourceReader = table.createReader(options) + + override def computeStats(): Statistics = newReader() match { + case r: SupportsReportStatistics => + Statistics(sizeInBytes = r.getStatistics.sizeInBytes().orElse(conf.defaultSizeInBytes)) + case _ => + Statistics(sizeInBytes = conf.defaultSizeInBytes) + } + + override def newInstance(): TableV2Relation = { + copy(output = output.map(_.newInstance())) + } +} + /** * A specialization of [[DataSourceV2Relation]] with the streaming bit set to true. * @@ -88,6 +122,8 @@ case class StreamingDataSourceV2Relation( reader: DataSourceReader) extends LeafNode with MultiInstanceRelation with DataSourceV2StringFormat { + override def sourceName: String = source.name + override def isStreaming: Boolean = true override def simpleString: String = "Streaming RelationV2 " + metadataString @@ -116,84 +152,22 @@ case class StreamingDataSourceV2Relation( } object DataSourceV2Relation { - private implicit class SourceHelpers(source: DataSourceV2) { - def asReadSupport: ReadSupport = { - source match { - case support: ReadSupport => - support - case _: ReadSupportWithSchema => - // this method is only called if there is no user-supplied schema. if there is no - // user-supplied schema and ReadSupport was not implemented, throw a helpful exception. 
- throw new AnalysisException(s"Data source requires a user-supplied schema: $name") - case _ => - throw new AnalysisException(s"Data source is not readable: $name") - } - } - - def asReadSupportWithSchema: ReadSupportWithSchema = { - source match { - case support: ReadSupportWithSchema => - support - case _: ReadSupport => - throw new AnalysisException( - s"Data source does not support user-supplied schema: $name") - case _ => - throw new AnalysisException(s"Data source is not readable: $name") - } - } - - def asWriteSupport: WriteSupport = { - source match { - case support: WriteSupport => - support - case _ => - throw new AnalysisException(s"Data source is not writable: $name") - } - } - - def name: String = { - source match { - case registered: DataSourceRegister => - registered.shortName() - case _ => - source.getClass.getSimpleName - } - } - - def createReader( - options: Map[String, String], - userSpecifiedSchema: Option[StructType]): DataSourceReader = { - val v2Options = new DataSourceOptions(options.asJava) - userSpecifiedSchema match { - case Some(s) => - asReadSupportWithSchema.createReader(s, v2Options) - case _ => - asReadSupport.createReader(v2Options) - } - } - - def createWriter( - options: Map[String, String], - schema: StructType): DataSourceWriter = { - val v2Options = new DataSourceOptions(options.asJava) - asWriteSupport.createWriter(UUID.randomUUID.toString, schema, SaveMode.Append, v2Options).get - } - } - def create( source: DataSourceV2, options: Map[String, String], tableIdent: Option[TableIdentifier] = None, - userSpecifiedSchema: Option[StructType] = None): DataSourceV2Relation = { + userSpecifiedSchema: Option[StructType] = None): NamedRelation = { val reader = source.createReader(options, userSpecifiedSchema) - val ident = tableIdent.orElse(tableFromOptions(options)) + val ident = tableIdent.orElse(options.table) DataSourceV2Relation( source, reader.readSchema().toAttributes, options, ident, userSpecifiedSchema) } - private def tableFromOptions(options: Map[String, String]): Option[TableIdentifier] = { - options - .get(DataSourceOptions.TABLE_KEY) - .map(TableIdentifier(_, options.get(DataSourceOptions.DATABASE_KEY))) + def create( + catalogName: String, + ident: TableIdentifier, + table: Table, + options: Map[String, String]): NamedRelation = { + TableV2Relation(catalogName, ident, table, table.schema.toAttributes, options) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index b030b9a929b0..643feeb57e50 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.catalyst.plans.physical.SinglePartition import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.streaming.continuous._ -import org.apache.spark.sql.sources.v2.DataSourceV2 import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader import org.apache.spark.sql.types.StructType @@ -39,7 +38,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch */ case class DataSourceV2ScanExec( output: Seq[AttributeReference], - @transient source: DataSourceV2, + 
@transient sourceName: String, @transient options: Map[String, String], @transient pushedFilters: Seq[Expression], @transient reader: DataSourceReader) @@ -55,7 +54,7 @@ case class DataSourceV2ScanExec( } override def hashCode(): Int = { - Seq(output, source, options).hashCode() + Seq(output, sourceName, options).hashCode() } override def outputPartitioning: physical.Partitioning = reader match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 6daaa4c65c33..a9a433c5ea7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -20,9 +20,11 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable import org.apache.spark.sql.{sources, Strategy} +import org.apache.spark.sql.catalog.v2.TableCatalog +import org.apache.spark.sql.catalyst.analysis.NamedRelation import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, Repartition, ReplaceTableAsSelect} import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} @@ -31,6 +33,8 @@ import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader object DataSourceV2Strategy extends Strategy { + import org.apache.spark.sql.sources.v2.DataSourceV2Implicits._ + /** * Pushes down filters to the data source reader * @@ -81,7 +85,7 @@ object DataSourceV2Strategy extends Strategy { // TODO: nested column pruning. private def pruneColumns( reader: DataSourceReader, - relation: DataSourceV2Relation, + relation: NamedRelation, exprs: Seq[Expression]): Seq[AttributeReference] = { reader match { case r: SupportsPushDownRequiredColumns => @@ -102,10 +106,15 @@ object DataSourceV2Strategy extends Strategy { } } - override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case PhysicalOperation(project, filters, relation: DataSourceV2Relation) => - val reader = relation.newReader() + case PhysicalOperation(project, filters, relation: NamedRelation) + if relation.isInstanceOf[DataSourceV2Relation] || relation.isInstanceOf[TableV2Relation] => + + val (reader, options, sourceName) = relation match { + case r: DataSourceV2Relation => (r.newReader(), r.options, r.sourceName) + case r: TableV2Relation => (r.newReader(), r.options, r.catalogName) + } + // `pushedFilters` will be pushed down and evaluated in the underlying data sources. // `postScanFilters` need to be evaluated after the scan. // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter. 
@@ -113,14 +122,14 @@ object DataSourceV2Strategy extends Strategy { val output = pruneColumns(reader, relation, project ++ postScanFilters) logInfo( s""" - |Pushing operators to ${relation.source.getClass} + |Pushing operators to $sourceName |Pushed Filters: ${pushedFilters.mkString(", ")} |Post-Scan Filters: ${postScanFilters.mkString(",")} |Output: ${output.mkString(", ")} """.stripMargin) val scan = DataSourceV2ScanExec( - output, relation.source, relation.options, pushedFilters, reader) + output, sourceName, options, pushedFilters, reader) val filterCondition = postScanFilters.reduceLeftOption(And) val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan) @@ -131,13 +140,21 @@ object DataSourceV2Strategy extends Strategy { case r: StreamingDataSourceV2Relation => // ensure there is a projection, which will produce unsafe rows required by some operators ProjectExec(r.output, - DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil + DataSourceV2ScanExec(r.output, r.source.name, r.options, r.pushedFilters, r.reader)) :: Nil case WriteToDataSourceV2(writer, query) => WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil - case AppendData(r: DataSourceV2Relation, query, _) => - WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil + case AppendData(r: TableV2Relation, query, _) => + AppendDataExec(r.table, r.options, planLater(query)) :: Nil + + case CreateTableAsSelect(catalog, ident, partitioning, query, writeOptions, ignoreIfExists) => + CreateTableAsSelectExec(catalog, ident, partitioning, Map.empty, writeOptions, + planLater(query), ignoreIfExists) :: Nil + + case ReplaceTableAsSelect(catalog, ident, partitioning, query, writeOptions) => + ReplaceTableAsSelectExec(catalog, ident, partitioning, Map.empty, writeOptions, + planLater(query)) :: Nil case WriteToContinuousDataSource(writer, query) => WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2StringFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2StringFormat.scala index 97e6c6d702ac..5c5a51c99c77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2StringFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2StringFormat.scala @@ -20,8 +20,6 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} -import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.DataSourceV2 import org.apache.spark.util.Utils /** @@ -34,7 +32,7 @@ trait DataSourceV2StringFormat { * The instance of this data source implementation. Note that we only consider its class in * equals/hashCode, not the instance itself. */ - def source: DataSourceV2 + def sourceName: String /** * The output of the data source reader, w.r.t. column pruning. 
@@ -51,13 +49,6 @@ trait DataSourceV2StringFormat { */ def pushedFilters: Seq[Expression] - private def sourceName: String = source match { - case registered: DataSourceRegister => registered.shortName() - // source.getClass.getSimpleName can cause Malformed class name error, - // call safer `Utils.getSimpleName` instead - case _ => Utils.getSimpleName(source.getClass) - } - def metadataString: String = { val entries = scala.collection.mutable.ArrayBuffer.empty[(String, String)] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index 9ac745425a6a..95f8e78899a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources.v2 +import scala.collection.JavaConverters._ import scala.util.control.NonFatal import org.apache.spark.{SparkEnv, SparkException, TaskContext} @@ -24,9 +25,11 @@ import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalog.v2.{Table, TableCatalog} +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} -import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.MicroBatchExecution @@ -44,14 +47,92 @@ case class WriteToDataSourceV2(writer: DataSourceWriter, query: LogicalPlan) ext override def output: Seq[Attribute] = Nil } +case class AppendDataExec( + table: Table, + writeOptions: Map[String, String], + plan: SparkPlan) extends V2TableWriteExec(writeOptions, plan) { + + override protected def doExecute(): RDD[InternalRow] = { + appendToTable(table) + } +} + +case class CreateTableAsSelectExec( + catalog: TableCatalog, + ident: TableIdentifier, + partitioning: Seq[Expression], + properties: Map[String, String], + writeOptions: Map[String, String], + plan: SparkPlan, + ifNotExists: Boolean) extends V2TableWriteExec(writeOptions, plan) { + + override protected def doExecute(): RDD[InternalRow] = { + if (catalog.tableExists(ident)) { + if (ifNotExists) { + return sparkContext.parallelize(Seq.empty, 1) + } + + throw new TableAlreadyExistsException(ident.database.getOrElse("null"), ident.table) + } + + Utils.tryWithSafeFinally({ + val table = catalog.createTable(ident, plan.schema, partitioning.asJava, properties.asJava) + appendToTable(table) + })(finallyBlock = { + catalog.dropTable(ident) + }) + } +} + +case class ReplaceTableAsSelectExec( + catalog: TableCatalog, + ident: TableIdentifier, + partitioning: Seq[Expression], + properties: Map[String, String], + writeOptions: Map[String, String], + plan: SparkPlan) extends V2TableWriteExec(writeOptions, plan) { + + override protected def doExecute(): RDD[InternalRow] = { + if (!catalog.tableExists(ident)) { + throw new 
NoSuchTableException(ident.database.getOrElse("null"), ident.table) + } + + catalog.dropTable(ident) + + Utils.tryWithSafeFinally({ + val table = catalog.createTable(ident, plan.schema, partitioning.asJava, properties.asJava) + appendToTable(table) + })(finallyBlock = { + catalog.dropTable(ident) + }) + } +} + +case class WriteToDataSourceV2Exec( + writer: DataSourceWriter, + plan: SparkPlan) extends V2TableWriteExec(Map.empty, plan) { + + override protected def doExecute(): RDD[InternalRow] = { + doAppend(writer) + } +} + /** - * The physical plan for writing data into data source v2. + * The base physical plan for writing data into data source v2. */ -case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) extends SparkPlan { +abstract class V2TableWriteExec( + options: Map[String, String], + query: SparkPlan) extends SparkPlan { + import org.apache.spark.sql.sources.v2.DataSourceV2Implicits._ + override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil - override protected def doExecute(): RDD[InternalRow] = { + protected def appendToTable(table: Table): RDD[InternalRow] = { + doAppend(table.createWriter(options, query.schema)) + } + + protected def doAppend(writer: DataSourceWriter): RDD[InternalRow] = { val writeTask = writer match { case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory() case _ => new InternalRowDataWriterFactory(writer.createWriterFactory(), query.schema) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/v2/DataSourceV2Implicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/v2/DataSourceV2Implicits.scala new file mode 100644 index 000000000000..68feb9876b6a --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/v2/DataSourceV2Implicits.scala @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2 + +import java.util.UUID + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.catalog.v2.{CaseInsensitiveStringMap, CatalogProvider, Table, TableCatalog} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.sources.v2.reader.DataSourceReader +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter +import org.apache.spark.sql.types.StructType + +/** + * Implicit helper classes to make working with the v2 API in Scala easier. 
+ */ +private[sql] object DataSourceV2Implicits { + implicit class CatalogHelper(catalog: CatalogProvider) { + def asTableCatalog: TableCatalog = catalog match { + case tableCatalog: TableCatalog => + tableCatalog + case _ => + throw new UnsupportedOperationException(s"Catalog $catalog does not support tables") + } + } + + implicit class TableHelper(table: Table) { + def asReadSupport: ReadSupport = { + table match { + case support: ReadSupport => + support + case _: ReadSupportWithSchema => + // this method is only called if there is no user-supplied schema. if there is no + // user-supplied schema and ReadSupport was not implemented, throw a helpful exception. + throw new AnalysisException(s"Table requires a user-supplied schema: $table") + case _ => + throw new AnalysisException(s"Table is not readable: $table") + } + } + + def asReadSupportWithSchema: ReadSupportWithSchema = { + table match { + case support: ReadSupportWithSchema => + support + case _: ReadSupport => + throw new AnalysisException( + s"Table does not support user-supplied schema: $table") + case _ => + throw new AnalysisException(s"Table is not readable: $table") + } + } + + def asWriteSupport: WriteSupport = { + table match { + case support: WriteSupport => + support + case _ => + throw new AnalysisException(s"Table is not writable: $table") + } + } + + def createReader( + options: Map[String, String]): DataSourceReader = { + table match { + case supportWithSchema: ReadSupportWithSchema => + supportWithSchema.createReader(table.schema(), options.asDataSourceOptions) + case support: ReadSupport => + support.createReader(options.asDataSourceOptions) + } + } + + def createWriter( + options: Map[String, String], + schema: StructType): DataSourceWriter = { + asWriteSupport.createWriter( + UUID.randomUUID.toString, schema, SaveMode.Append, options.asDataSourceOptions).get + } + } + + implicit class SourceHelper(source: DataSourceV2) { + def asReadSupport: ReadSupport = { + source match { + case support: ReadSupport => + support + case _: ReadSupportWithSchema => + // this method is only called if there is no user-supplied schema. if there is no + // user-supplied schema and ReadSupport was not implemented, throw a helpful exception. 
+ throw new AnalysisException(s"Data source requires a user-supplied schema: $name") + case _ => + throw new AnalysisException(s"Data source is not readable: $name") + } + } + + def asReadSupportWithSchema: ReadSupportWithSchema = { + source match { + case support: ReadSupportWithSchema => + support + case _: ReadSupport => + throw new AnalysisException( + s"Data source does not support user-supplied schema: $name") + case _ => + throw new AnalysisException(s"Data source is not readable: $name") + } + } + + def asWriteSupport: WriteSupport = { + source match { + case support: WriteSupport => + support + case _ => + throw new AnalysisException(s"Data source is not writable: $name") + } + } + + def name: String = { + source match { + case registered: DataSourceRegister => + registered.shortName() + case _ => + source.getClass.getSimpleName + } + } + + def createReader( + options: Map[String, String], + userSpecifiedSchema: Option[StructType]): DataSourceReader = { + val v2Options = new DataSourceOptions(options.asJava) + userSpecifiedSchema match { + case Some(s) => + asReadSupportWithSchema.createReader(s, v2Options) + case _ => + asReadSupport.createReader(v2Options) + } + } + + def createWriter( + options: Map[String, String], + schema: StructType): DataSourceWriter = { + val v2Options = new DataSourceOptions(options.asJava) + asWriteSupport.createWriter(UUID.randomUUID.toString, schema, SaveMode.Append, v2Options).get + } + } + + implicit class OptionsHelper(options: Map[String, String]) { + def asDataSourceOptions: DataSourceOptions = { + new DataSourceOptions(options.asJava) + } + + def asCaseInsensitiveMap: CaseInsensitiveStringMap = { + val map = CaseInsensitiveStringMap.empty() + map.putAll(options.asJava) + map + } + + def table: Option[TableIdentifier] = { + val map = asCaseInsensitiveMap + Option(map.get(DataSourceOptions.TABLE_KEY)) + .map(TableIdentifier(_, Option(map.get(DataSourceOptions.DATABASE_KEY)))) + } + + def paths: Array[String] = { + asDataSourceOptions.paths() + } + } +} From 770a31cf43903736b10455a82b4743987dd27db6 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 11 May 2018 15:04:15 -0700 Subject: [PATCH 11/14] SPARK-24253: Add DeleteSupport mix-in for DataSourceV2. --- .../spark/sql/sources/v2/DeleteSupport.java | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java new file mode 100644 index 000000000000..33f0f120cb6d --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import org.apache.spark.sql.catalyst.expressions.Expression; + +public interface DeleteSupport extends DataSourceV2 { + /** + * Delete data from a data source table that matches filter expressions. + *
+ * Rows are deleted from the data source iff all of the filter expressions match. That is, the + * expressions must be interpreted as a set of filters that are ANDed together. + *
+ * Implementations may reject a delete operation if the delete isn't possible without significant + * effort. For example, partitioned data sources may reject deletes that do not filter by + * partition columns because the filter may require rewriting files without deleted records. + * To reject a delete implementations should throw {@link IllegalArgumentException} with a clear + * error message that identifies which expression was rejected. + *
+ * Implementations may throw {@link UnsupportedOperationException} if the delete operation is not + * supported because one of the filter expressions is not supported. Implementations should throw + * this exception with a clear error message that identifies the unsupported expression. + * + * @param filters filter expressions, used to select rows to delete when all expressions match + * @throws UnsupportedOperationException If one or more filter expressions is not supported + * @throws IllegalArgumentException If the delete is rejected due to required effort + */ + void deleteWhere(Expression[] filters); +} From f769341547cdc63f77456c1c2677070a4d3d363c Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 11 May 2018 15:09:22 -0700 Subject: [PATCH 12/14] SPARK-24253: Add interface description to DeleteSupport. --- .../org/apache/spark/sql/sources/v2/DeleteSupport.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java index 33f0f120cb6d..090255c910ca 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java @@ -19,6 +19,13 @@ import org.apache.spark.sql.catalyst.expressions.Expression; +/** + * A mix-in interface for {@link DataSourceV2} delete support. Data sources can implement this + * interface to provide the ability to delete data from tables that matches filter expressions. + *
+ * Data sources must implement this interface to support logical operations that combine writing + * data with deleting data, like overwriting partitions. + */ public interface DeleteSupport extends DataSourceV2 { /** * Delete data from a data source table that matches filter expressions. From af16c4238fd486a82e86dda02ad21279104afe21 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 13 Jun 2018 16:08:21 -0700 Subject: [PATCH 13/14] Add DeleteFrom logical plan, add to parser. --- .../org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 8 ++++++++ .../catalyst/plans/logical/basicLogicalOperators.scala | 6 ++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 2aca10f1bfbc..53320cb09244 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -167,6 +167,7 @@ statement | SET ROLE .*? #failNativeCommand | SET .*? #setConfiguration | RESET #resetConfiguration + | DELETE FROM tableIdentifier WHERE expression #deleteFrom | unsupportedHiveNativeCommands .*? #failNativeCommand ; @@ -215,7 +216,6 @@ unsupportedHiveNativeCommands | kw1=COMMIT | kw1=ROLLBACK | kw1=DFS - | kw1=DELETE kw2=FROM ; createTableHeader diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 49f578a24aae..b51f047b8030 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -547,6 +547,14 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging } } + /** + * Delete data from a table that matches a boolean expression. + */ + override def visitDeleteFrom(ctx: DeleteFromContext): LogicalPlan = withOrigin(ctx) { + val tableIdent = visitTableIdentifier(ctx.tableIdentifier()) + DeleteFrom(UnresolvedRelation(tableIdent), expression(ctx.expression())) + } + /** * Add a [[WithWindowDefinition]] operator to a logical plan. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 8f1c36dcef37..7d918cd50397 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -345,6 +345,12 @@ case class Join( } } +case class DeleteFrom(table: LogicalPlan, expr: Expression) extends LogicalPlan { + override def output: Seq[Attribute] = Seq.empty + + override def children: Seq[LogicalPlan] = Seq.empty +} + /** * Append data to an existing table. */ From f8b178d34b870e779ec061175f01ba63a5adc076 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Thu, 26 Jul 2018 15:02:25 -0700 Subject: [PATCH 14/14] SPARK-24253: Implement DELETE FROM for v2 tables. 
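With the changes below, a catalog-qualified DELETE becomes expressible in SQL. A minimal usage
sketch, assuming a SparkSession `spark` and a v2 catalog available under the name `prod` with a
table `prod.db.events` that implements DeleteSupport (the catalog and table names are
illustrative; how catalogs are registered is outside this patch):

    // Parsed by the new `deleteFrom` grammar rule into roughly
    //   DeleteFrom(
    //     UnresolvedRelation(CatalogTableIdentifier("events", Some("db"), Some("prod"))),
    //     <parsed predicate>)
    // then resolved by ResolveCatalogV2Relations and planned as DeleteFromV2Exec.
    spark.sql("DELETE FROM prod.db.events WHERE ts < '2018-01-01'")

Note that the grammar requires a WHERE clause, so a bare DELETE FROM without a predicate does
not match this rule.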
--- .../spark/sql/catalyst/parser/SqlBase.g4 | 6 +- .../sql/catalyst/analysis/Analyzer.scala | 5 +- .../sql/catalyst/analysis/unresolved.scala | 16 ++-- .../spark/sql/catalyst/identifiers.scala | 85 ++++++++++++++++--- .../sql/catalyst/parser/AstBuilder.scala | 15 +++- .../sql/catalyst/parser/ParserInterface.scala | 6 ++ .../spark/sql/sources/v2/DeleteSupport.java | 3 +- .../sql/catalog/v2/CatalogV2Implicits.scala | 29 +++++++ .../v2/ResolveCatalogV2Relations.scala | 36 ++++++++ .../datasources/v2/DataSourceV2Relation.scala | 2 +- .../datasources/v2/DataSourceV2Strategy.scala | 6 +- .../datasources/v2/DeleteFromV2Exec.scala | 43 ++++++++++ .../internal/BaseSessionStateBuilder.scala | 4 +- 13 files changed, 226 insertions(+), 30 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/CatalogV2Implicits.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/ResolveCatalogV2Relations.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromV2Exec.scala diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 53320cb09244..2bb930280018 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -167,7 +167,7 @@ statement | SET ROLE .*? #failNativeCommand | SET .*? #setConfiguration | RESET #resetConfiguration - | DELETE FROM tableIdentifier WHERE expression #deleteFrom + | DELETE FROM catalogTableIdentifier WHERE expression #deleteFrom | unsupportedHiveNativeCommands .*? #failNativeCommand ; @@ -526,6 +526,10 @@ rowFormat (NULL DEFINED AS nullDefinedAs=STRING)? #rowFormatDelimited ; +catalogTableIdentifier + : ((catalog=identifier '.')? db=identifier '.')? table=identifier + ; + tableIdentifier : (db=identifier '.')? table=identifier ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 63f8c10b490f..da4571792f80 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -696,13 +696,14 @@ class Analyzer( } def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { - case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, _, _) if child.resolved => + case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, _, _) + if child.resolved && u.table.catalog.isEmpty => EliminateSubqueryAliases(lookupTableFromCatalog(u)) match { case v: View => u.failAnalysis(s"Inserting into a view is not allowed. View: ${v.desc.identifier}.") case other => i.copy(table = other) } - case u: UnresolvedRelation => resolveRelation(u) + case u @ UnresolvedRelation(table) if table.catalog.isEmpty => resolveRelation(u) } // Look up the table with the given name from catalog. 
The database we used is decided by the diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 71e23175168e..ff8874cf7435 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.{CatalogTableIdentifier, FunctionIdentifier, InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} @@ -38,15 +38,19 @@ class UnresolvedException[TreeType <: TreeNode[_]](tree: TreeType, function: Str /** * Holds the name of a relation that has yet to be looked up in a catalog. * - * @param tableIdentifier table name + * @param table a [[CatalogTableIdentifier]] */ -case class UnresolvedRelation(tableIdentifier: TableIdentifier) - extends LeafNode { +case class UnresolvedRelation(table: CatalogTableIdentifier) extends LeafNode with NamedRelation { /** Returns a `.` separated name for this relation. */ - def tableName: String = tableIdentifier.unquotedString + def tableName: String = table.unquotedString - override def output: Seq[Attribute] = Nil + def name: String = tableName + + /** Returns the table identifier without the catalog element */ + def tableIdentifier: TableIdentifier = table.asTableIdentifier + + override def output: Seq[AttributeReference] = Nil override lazy val resolved = false } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala index a3cc4529b545..6b0bd57d1756 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala @@ -18,30 +18,32 @@ package org.apache.spark.sql.catalyst /** - * An identifier that optionally specifies a database. + * An identifier that optionally specifies a database and catalog. * * Format (unquoted): "name" or "db.name" * Format (quoted): "`name`" or "`db`.`name`" */ -sealed trait IdentifierWithDatabase { +sealed trait IdentifierWithOptionalDatabaseAndCatalog { val identifier: String def database: Option[String] + def catalog: Option[String] + /* * Escapes back-ticks within the identifier name with double-back-ticks. 
*/ private def quoteIdentifier(name: String): String = name.replace("`", "``") def quotedString: String = { - val replacedId = quoteIdentifier(identifier) - val replacedDb = database.map(quoteIdentifier(_)) - - if (replacedDb.isDefined) s"`${replacedDb.get}`.`$replacedId`" else s"`$replacedId`" + // database is required if catalog is present + assert(database.isDefined || catalog.isEmpty) + def q(s: String): String = s"`${quoteIdentifier(s)}`" + Seq(catalog.map(q), database.map(q), Some(q(identifier))).flatten.mkString(".") } def unquotedString: String = { - if (database.isDefined) s"${database.get}.$identifier" else identifier + Seq(catalog, database, Some(identifier)).flatten.mkString(".") } override def toString: String = quotedString @@ -49,17 +51,62 @@ sealed trait IdentifierWithDatabase { /** - * Identifies a table in a database. - * If `database` is not defined, the current database is used. - * When we register a permanent function in the FunctionRegistry, we use - * unquotedString as the function name. + * Identifies a table in a database and catalog. + * If `database` is not defined, the current catalog's default database is used. + * If `catalog` is not defined, the current catalog is used. */ -case class TableIdentifier(table: String, database: Option[String]) - extends IdentifierWithDatabase { +case class CatalogTableIdentifier(table: String, database: Option[String], catalog: Option[String]) + extends IdentifierWithOptionalDatabaseAndCatalog { + + // ensure database is present if catalog is defined + assert(database.isDefined || catalog.isEmpty) override val identifier: String = table + /** + * Returns this as a TableIdentifier if its catalog is not set, fail otherwise. + * + * This is used to provide TableIdentifier for paths that do not support the catalog element. To + * ensure that the identifier is compatible, this asserts that the catalog element is not defined. + */ + lazy val asTableIdentifier: TableIdentifier = { + assert(catalog.isEmpty, s"Cannot convert to TableIdentifier: catalog is ${catalog.get} != None") + new TableIdentifier(table, database) + } + + /** + * Returns this CatalogTableIdentifier without the catalog. + * + * This is used for code paths where the catalog has already been used. + */ + lazy val dropCatalog: CatalogTableIdentifier = catalog match { + case Some(_) => CatalogTableIdentifier(table, database, None) + case _ => this + } +} + + +/** + * Identifies a table in a database. + * If `database` is not defined, the current database is used. + * + * This class is used instead of CatalogTableIdentifier in paths that do not yet support table + * identifiers with catalogs. 
+ */ +class TableIdentifier(name: String, db: Option[String]) + extends CatalogTableIdentifier(name, db, None) { + def this(table: String) = this(table, None) + + override lazy val asTableIdentifier: TableIdentifier = this + + override def copy( + name: String = this.name, + database: Option[String] = this.db, + catalog: Option[String] = None): TableIdentifier = { + assert(catalog.isEmpty, "Cannot add catalog to a TableIdentifier using copy") + new TableIdentifier(name, database) + } } /** A fully qualified identifier for a table (i.e., database.tableName) */ @@ -69,18 +116,28 @@ case class QualifiedTableName(database: String, name: String) { object TableIdentifier { def apply(tableName: String): TableIdentifier = new TableIdentifier(tableName) + + def apply( + tableName: String, + database: Option[String] = None, + catalog: Option[String] = None): CatalogTableIdentifier = + CatalogTableIdentifier(tableName, database, catalog) } /** * Identifies a function in a database. * If `database` is not defined, the current database is used. + * When we register a permanent function in the FunctionRegistry, we use + * unquotedString as the function name. */ case class FunctionIdentifier(funcName: String, database: Option[String]) - extends IdentifierWithDatabase { + extends IdentifierWithOptionalDatabaseAndCatalog { override val identifier: String = funcName + override val catalog: Option[String] = None + def this(funcName: String) = this(funcName, None) override def toString: String = unquotedString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index b51f047b8030..a929f3b9368d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -29,7 +29,7 @@ import org.antlr.v4.runtime.tree.{ParseTree, RuleNode, TerminalNode} import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.{CatalogTableIdentifier, FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.CatalogStorageFormat import org.apache.spark.sql.catalyst.expressions._ @@ -551,7 +551,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging * Delete data from a table that matches a boolean expression. */ override def visitDeleteFrom(ctx: DeleteFromContext): LogicalPlan = withOrigin(ctx) { - val tableIdent = visitTableIdentifier(ctx.tableIdentifier()) + val tableIdent = visitCatalogTableIdentifier(ctx.catalogTableIdentifier()) DeleteFrom(UnresolvedRelation(tableIdent), expression(ctx.expression())) } @@ -937,6 +937,17 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging TableIdentifier(ctx.table.getText, Option(ctx.db).map(_.getText)) } + /** + * Create a [[CatalogTableIdentifier]] from a 'tableName' or 'databaseName'.'tableName' pattern. + */ + override def visitCatalogTableIdentifier( + ctx: CatalogTableIdentifierContext): CatalogTableIdentifier = withOrigin(ctx) { + TableIdentifier( + ctx.table.getText, + Option(ctx.db).map(_.getText), + Option(ctx.catalog).map(_.getText)) + } + /** * Create a [[FunctionIdentifier]] from a 'functionName' or 'databaseName'.'functionName' pattern. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala index 75240d219622..b37d3e25386c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala @@ -46,6 +46,12 @@ trait ParserInterface { @throws[ParseException]("Text cannot be parsed to a TableIdentifier") def parseTableIdentifier(sqlText: String): TableIdentifier +// /** +// * Parse a string to a [[TableIdentifier]]. +// */ +// @throws[ParseException]("Text cannot be parsed to a TableIdentifier") +// def parseCatalogTableIdentifier(sqlText: String): TableIdentifier + /** * Parse a string to a [[FunctionIdentifier]]. */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java index 090255c910ca..731f32d3603f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DeleteSupport.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources.v2; +import org.apache.spark.sql.catalog.v2.Table; import org.apache.spark.sql.catalyst.expressions.Expression; /** @@ -26,7 +27,7 @@ * Data sources must implement this interface to support logical operations that combine writing * data with deleting data, like overwriting partitions. */ -public interface DeleteSupport extends DataSourceV2 { +public interface DeleteSupport extends Table { /** * Delete data from a data source table that matches filter expressions. *
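To make the deleteWhere contract above concrete, here is a hedged Scala sketch of the kind of
implementation the interface anticipates for a partitioned source. The partition column names
and the dropPartitionsMatching helper are hypothetical and not part of this patch; only the
pattern (reject filters that touch non-partition columns, otherwise delete whole partitions)
comes from the Javadoc above. Other Table/DeleteSupport members are omitted.

    import org.apache.spark.sql.catalyst.expressions.Expression

    // Hypothetical: a table partitioned by `date` and `country` that can only delete
    // whole partitions, never individual rows.
    def deleteWhere(filters: Array[Expression]): Unit = {
      val partitionColumns = Set("date", "country")
      filters.foreach { filter =>
        val referenced = filter.references.map(_.name).toSet
        if (!referenced.subsetOf(partitionColumns)) {
          // Deleting individual rows would require rewriting files, so reject the
          // operation with a clear message, as the interface contract requires.
          throw new IllegalArgumentException(
            s"Cannot delete by non-partition filter: $filter")
        }
      }
      dropPartitionsMatching(filters)
    }

    // Source-specific and hypothetical: drop every partition whose values satisfy all filters.
    private def dropPartitionsMatching(filters: Array[Expression]): Unit = ()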
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/CatalogV2Implicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/CatalogV2Implicits.scala new file mode 100644 index 000000000000..57564cb3e2ed --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/CatalogV2Implicits.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalog.v2 + +private[sql] object CatalogV2Implicits { + implicit class CatalogHelper(catalog: CatalogProvider) { + def asTableCatalog: TableCatalog = catalog match { + case tableCatalog: TableCatalog => + tableCatalog + case _ => + throw new UnsupportedOperationException(s"Catalog $catalog does not support tables") + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/ResolveCatalogV2Relations.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/ResolveCatalogV2Relations.scala new file mode 100644 index 000000000000..77aa94600bf4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/v2/ResolveCatalogV2Relations.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalog.v2 + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.plans.logical.{DeleteFrom, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation + +case class ResolveCatalogV2Relations(sparkSession: SparkSession) extends Rule[LogicalPlan] { + import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { + case d @ DeleteFrom(u: UnresolvedRelation, _) if u.table.catalog.isDefined => + val catalogName = u.table.catalog.get + val catalog = sparkSession.catalog(catalogName).asTableCatalog + val table = catalog.loadTable(u.table.asTableIdentifier) + d.copy(table = DataSourceV2Relation.create(catalogName, u.table.dropCatalog, table)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 126c9bb1d670..0192f53dad2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -167,7 +167,7 @@ object DataSourceV2Relation { catalogName: String, ident: TableIdentifier, table: Table, - options: Map[String, String]): NamedRelation = { + options: Map[String, String] = Map.empty): NamedRelation = { TableV2Relation(catalogName, ident, table, table.schema.toAttributes, options) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index a9a433c5ea7f..b67567e55074 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -20,11 +20,10 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable import org.apache.spark.sql.{sources, Strategy} -import org.apache.spark.sql.catalog.v2.TableCatalog import org.apache.spark.sql.catalyst.analysis.NamedRelation import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, Repartition, ReplaceTableAsSelect} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, DeleteFrom, LogicalPlan, Repartition, ReplaceTableAsSelect} import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} @@ -170,6 +169,9 @@ object DataSourceV2Strategy extends Strategy { Nil } + case DeleteFrom(rel: TableV2Relation, expr) => + DeleteFromV2Exec(rel, expr) :: Nil + case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromV2Exec.scala new file mode 100644 
index 000000000000..9065ea425530 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromV2Exec.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.SparkException +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PredicateHelper} +import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.sources.v2.DeleteSupport + +case class DeleteFromV2Exec(rel: TableV2Relation, expr: Expression) + extends LeafExecNode with PredicateHelper { + + override protected def doExecute(): RDD[InternalRow] = { + rel.table match { + case d: DeleteSupport => + d.deleteWhere(splitConjunctivePredicates(expr).toArray) + case _ => + throw new SparkException( + s"Cannot delete, table that does not support deletes: ${rel.table}") + } + + sqlContext.sparkContext.parallelize(Seq.empty, 1) + } + + override def output: Seq[Attribute] = Seq.empty +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 3a0db7e16c23..4c2e1fd8a8d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.internal import org.apache.spark.SparkConf import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{ExperimentalMethods, SparkSession, UDFRegistration, _} +import org.apache.spark.sql.catalog.v2.ResolveCatalogV2Relations import org.apache.spark.sql.catalyst.analysis.{Analyzer, FunctionRegistry} import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer @@ -156,7 +157,8 @@ abstract class BaseSessionStateBuilder( */ protected def analyzer: Analyzer = new Analyzer(catalog, conf) { override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = - new FindDataSourceTable(session) +: + ResolveCatalogV2Relations(session) +: + new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: customResolutionRules
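For reference, a rough sketch of what reaches deleteWhere at execution time: DeleteFromV2Exec
splits the parsed predicate on And before handing it to the table, so `WHERE a = 1 AND b = 2`
arrives as two separate filter expressions. The attribute names and types below are illustrative
only.

    import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, Literal}
    import org.apache.spark.sql.types.IntegerType

    // What the analyzer produces for `WHERE a = 1 AND b = 2` (schema made up for the example).
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", IntegerType)()
    val expr = And(EqualTo(a, Literal(1)), EqualTo(b, Literal(2)))

    // DeleteFromV2Exec calls splitConjunctivePredicates(expr) and forwards the result:
    //   deleteWhere(Array(EqualTo(a, Literal(1)), EqualTo(b, Literal(2))))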