[SPARK-19575][SQL]Reading from or writing to a hive serde table with a non pre-existing location should succeed #16910
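For context, a minimal standalone sketch of the scenario this PR fixes (illustrative only, not part of the PR; the table name and path are hypothetical, and a Hive-enabled Spark build is assumed): reading from a hive serde table whose location directory is missing should return an empty result, and writing to it should recreate the directory, instead of either operation failing.

    import java.io.File

    import org.apache.spark.sql.SparkSession

    object NonExistingLocationDemo {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("SPARK-19575 demo")
          .enableHiveSupport()
          .getOrCreate()

        // Hypothetical location; CREATE TABLE creates this directory.
        val loc = new File("/tmp/spark19575_demo")
        spark.sql(s"CREATE TABLE t(a STRING, b INT) USING hive LOCATION '$loc'")

        // Remove the directory so the table's location no longer exists.
        loc.delete()

        // With this patch, the read returns an empty result instead of failing ...
        spark.sql("SELECT * FROM t").show()

        // ... and the write recreates the missing directory and succeeds.
        spark.sql("INSERT INTO TABLE t SELECT 'c', 1")

        spark.stop()
      }
    }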
Changes from all commits:
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.hive.execution

 import java.io.File
+import java.net.URI

 import org.apache.hadoop.fs.Path
 import org.scalatest.BeforeAndAfterEach

@@ -35,6 +36,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.Utils

 class HiveDDLSuite
   extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach {

@@ -63,6 +65,12 @@ class HiveDDLSuite
     fs.exists(filesystemPath)
   }

+  private def makeQualifiedPath(path: String): Path = {
+    val hadoopPath = new Path(path)
+    val fs = hadoopPath.getFileSystem(sparkContext.hadoopConfiguration)
+    fs.makeQualified(hadoopPath)
+  }
+
   test("drop tables") {
     withTable("tab1") {
       val tabName = "tab1"

@@ -1588,6 +1596,147 @@ class HiveDDLSuite
     }
   }

+  test("insert data to a hive serde table which has a non-existing location should succeed") {
+    withTable("t") {
+      withTempDir { dir =>
+        spark.sql(
+          s"""
+             |CREATE TABLE t(a string, b int)
+             |USING hive
+             |LOCATION '$dir'
+           """.stripMargin)
+        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+        assert(new Path(table.location) == makeQualifiedPath(dir.getAbsolutePath))
+
+        val tableLocFile = new File(new URI(table.location))
+        tableLocFile.delete()
+        assert(!tableLocFile.exists())
+        spark.sql("INSERT INTO TABLE t SELECT 'c', 1")
+        assert(tableLocFile.exists())
+        checkAnswer(spark.table("t"), Row("c", 1) :: Nil)
+
+        Utils.deleteRecursively(dir)
+        assert(!tableLocFile.exists())
+        spark.sql("INSERT OVERWRITE TABLE t SELECT 'c', 1")
+        assert(tableLocFile.exists())
+        checkAnswer(spark.table("t"), Row("c", 1) :: Nil)
+
+        val newDirFile = new File(dir, "x")
+        val newDirPath = newDirFile.getAbsolutePath
+        spark.sql(s"ALTER TABLE t SET LOCATION '$newDirPath'")
+        spark.sessionState.catalog.refreshTable(TableIdentifier("t"))
+
+        val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+        assert(table1.location == newDirPath)
+        assert(!newDirFile.exists())
+
+        spark.sql("INSERT INTO TABLE t SELECT 'c', 1")
+        checkAnswer(spark.table("t"), Row("c", 1) :: Nil)
+        assert(newDirFile.exists())
+      }
+    }
+  }
+
+  test("insert into a hive serde table with non-existing partition location should succeed") {
+    withTable("t") {
+      withTempDir { dir =>
+        spark.sql(
+          s"""
+             |CREATE TABLE t(a int, b int, c int, d int)
+             |USING hive
+             |PARTITIONED BY(a, b)
+             |LOCATION "$dir"
+           """.stripMargin)
+        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+        assert(new Path(table.location) == makeQualifiedPath(dir.getAbsolutePath))
+
+        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4")
+        checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil)
+
+        val partLoc = new File(dir, "a=1")
+        Utils.deleteRecursively(partLoc)
+        assert(!partLoc.exists())
+        // insert overwrite into a partition whose location has been deleted.
+        spark.sql("INSERT OVERWRITE TABLE t PARTITION(a=1, b=2) SELECT 7, 8")
+        assert(partLoc.exists())
+        checkAnswer(spark.table("t"), Row(7, 8, 1, 2) :: Nil)
+
+        val newDirFile = new File(dir, "x")
+        val newDirPath = newDirFile.getAbsolutePath
+        spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION '$newDirPath'")
+        assert(!newDirFile.exists())
+
+        // insert into a partition whose location does not exist.
+        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 9, 10")
+        assert(newDirFile.exists())
+        checkAnswer(spark.table("t"), Row(9, 10, 1, 2) :: Nil)
+      }
+    }
+  }
+
+  test("read data from a hive serde table which has a non-existing location should succeed") {
+    withTable("t") {
+      withTempDir { dir =>
+        spark.sql(
+          s"""
+             |CREATE TABLE t(a string, b int)
+             |USING hive
+             |LOCATION "$dir"
+           """.stripMargin)
+        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+        assert(new Path(table.location) == makeQualifiedPath(dir.getAbsolutePath))
+
+        dir.delete()
+        checkAnswer(spark.table("t"), Nil)
+
+        val newDirFile = new File(dir, "x")
+        val newDirPath = newDirFile.getAbsolutePath
+        spark.sql(s"ALTER TABLE t SET LOCATION '$newDirPath'")
+
+        val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+        assert(table1.location == newDirPath)
+        assert(!newDirFile.exists())
+        checkAnswer(spark.table("t"), Nil)
+      }
+    }
+  }
+
+  test("read data from a hive serde table with non-existing partition location should succeed") {
+    withTable("t") {
+      withTempDir { dir =>
+        spark.sql(
+          s"""
+             |CREATE TABLE t(a int, b int, c int, d int)
+             |USING hive
+             |PARTITIONED BY(a, b)
+             |LOCATION "$dir"
+           """.stripMargin)
+        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4")
+        checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil)
+
+        val newDirFile = new File(dir, "x")
+        val newDirPath = newDirFile.getAbsolutePath
+        spark.sql(s"ALTER TABLE t PARTITION(a=1, b=2) SET LOCATION '$newDirPath'")
+        assert(!newDirFile.exists())
+        // select from a partition whose location has been changed to a non-existent one.
+        withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
Contributor: why setting this conf?

Contributor (Author): If we don't set it, the query throws an exception. If we set it, Spark checks whether the partition path exists and returns an empty RDD instead of throwing, even when the path does not exist.

Contributor: Is this expected? I think Hive will always return an empty result, right?

Contributor: BTW, this conf will be removed soon, as it has bugs.

Contributor (Author): OK, thanks. Then do we also need to modify something here?

Contributor (Author): Yes, Hive returns an empty result. If there is a bug here (could you describe what the bug is?), we can remove the conf and always return the result.
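To summarize the thread, a minimal sketch of the two behaviors under discussion, reusing this suite's helpers (the exact exception with the conf off is an assumption; it depends on the underlying Hadoop input format):

    // With spark.sql.hive.verifyPartitionPath off (the default), scanning a
    // partition whose directory is missing is expected to fail in the Hadoop
    // input layer (exception type assumed here).
    intercept[Exception] {
      spark.sql("SELECT * FROM t WHERE a = 1 AND b = 2").collect()
    }

    // With the conf on, Spark verifies partition paths first and silently
    // skips missing ones, so the scan returns an empty result.
    withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
      checkAnswer(spark.sql("SELECT * FROM t WHERE a = 1 AND b = 2"), Nil)
    }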
+          checkAnswer(spark.sql("select * from t where a=1 and b=2"), Nil)
+        }
+
+        spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 5, 6")
+        checkAnswer(spark.table("t"), Row(5, 6, 1, 2) :: Nil)
+        assert(newDirFile.exists())
+
+        // select from a partition whose location has been deleted.
+        Utils.deleteRecursively(newDirFile)
+        assert(!newDirFile.exists())
+        withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
+          checkAnswer(spark.sql("select * from t where a=1 and b=2"), Nil)
+        }
+      }
+    }
+  }
+
   Seq(true, false).foreach { shouldDelete =>
     val tcName = if (shouldDelete) "non-existent" else "existed"
     test(s"CTAS for external data source table with a $tcName location") {

@@ -1651,10 +1800,8 @@
            |LOCATION '$dir'
            |AS SELECT 3 as a, 4 as b, 1 as c, 2 as d
          """.stripMargin)
-        val dirPath = new Path(dir.getAbsolutePath)
-        val fs = dirPath.getFileSystem(spark.sessionState.newHadoopConf())
         val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
-        assert(new Path(table.location) == fs.makeQualified(dirPath))
+        assert(new Path(table.location) == makeQualifiedPath(dir.getAbsolutePath))

         checkAnswer(spark.table("t"), Row(3, 4, 1, 2))
       }

@@ -1672,10 +1819,8 @@
            |LOCATION '$dir'
            |AS SELECT 3 as a, 4 as b, 1 as c, 2 as d
          """.stripMargin)
-        val dirPath = new Path(dir.getAbsolutePath)
-        val fs = dirPath.getFileSystem(spark.sessionState.newHadoopConf())
         val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t1"))
-        assert(new Path(table.location) == fs.makeQualified(dirPath))
+        assert(new Path(table.location) == makeQualifiedPath(dir.getAbsolutePath))

         val partDir = new File(dir, "a=3")
         assert(partDir.exists())
Contributor: can we just call dir.delete before creating this table?

Contributor (Author): ok~

Contributor (Author): @cloud-fan I found the dir will be created in create table, so we should keep the current logic.

Contributor: does Hive have the same behavior?

Contributor (Author): Yes, I tested it in Hive; hdfs:/xx will be created.

Contributor: seems the InMemoryCatalog doesn't do this, you can send a new PR to fix it.

Contributor (Author): ok, thanks~
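For reference, a rough sketch of what that follow-up test might look like (hypothetical, not part of this PR; it reuses the same suite helpers and assumes the InMemoryCatalog is active):

    // Hypothetical follow-up (names assumed): CREATE TABLE with an explicit
    // LOCATION should create the directory under the InMemoryCatalog too,
    // matching the Hive behavior observed above.
    test("CREATE TABLE with a non-existing location should create the directory") {
      withTable("t") {
        withTempDir { dir =>
          val loc = new File(dir, "nonexistent")
          assert(!loc.exists())
          spark.sql(s"CREATE TABLE t(a int) USING parquet LOCATION '${loc.getAbsolutePath}'")
          assert(loc.exists(), "the catalog should create the table location on CREATE TABLE")
        }
      }
    }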