Skip to content

Commit e3d2022

Browse files
zuotingbing authored and
gatorsmile committed
[SPARK-20594][SQL] The staging directory should be a child directory whose name starts with "." to avoid being deleted if we set hive.exec.stagingdir under the table directory.
JIRA Issue: https://issues.apache.org/jira/browse/SPARK-20594 ## What changes were proposed in this pull request? The staging directory should be a child directory whose name starts with "." so that it is not deleted before the staging directory is moved to the table directory when hive.exec.stagingdir is set under the table directory. ## How was this patch tested? Added unit tests. Author: zuotingbing <[email protected]> Closes #17858 from zuotingbing/spark-stagingdir.
1 parent 0d3a631 commit e3d2022

File tree

2 files changed

+25
-2
lines changed

2 files changed

+25
-2
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
package org.apache.spark.sql.hive.execution
1919

20-
import java.io.IOException
20+
import java.io.{File, IOException}
2121
import java.net.URI
2222
import java.text.SimpleDateFormat
2323
import java.util.{Date, Locale, Random}
@@ -97,12 +97,24 @@ case class InsertIntoHiveTable(
9797
val inputPathUri: URI = inputPath.toUri
9898
val inputPathName: String = inputPathUri.getPath
9999
val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
100-
val stagingPathName: String =
100+
var stagingPathName: String =
101101
if (inputPathName.indexOf(stagingDir) == -1) {
102102
new Path(inputPathName, stagingDir).toString
103103
} else {
104104
inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
105105
}
106+
107+
// SPARK-20594: This is a workaround for a Hive bug. Hive requires that the
108+
// staging directory needs to avoid being deleted when users set hive.exec.stagingdir
109+
// under the table directory.
110+
if (FileUtils.isSubDir(new Path(stagingPathName), inputPath, fs) &&
111+
!stagingPathName.stripPrefix(inputPathName).stripPrefix(File.separator).startsWith(".")) {
112+
logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
113+
"with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
114+
"directory.")
115+
stagingPathName = new Path(inputPathName, ".hive-staging").toString
116+
}
117+
106118
val dir: Path =
107119
fs.makeQualified(
108120
new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))

sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,4 +494,15 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef
494494
spark.table("t").write.insertInto(tableName)
495495
}
496496
}
497+
498+
test("SPARK-20594: hive.exec.stagingdir was deleted by Hive") {
499+
// Set hive.exec.stagingdir to a directory under the table directory whose name does not start with ".".
500+
withSQLConf("hive.exec.stagingdir" -> "./test") {
501+
withTable("test_table") {
502+
sql("CREATE TABLE test_table (key int)")
503+
sql("INSERT OVERWRITE TABLE test_table SELECT 1")
504+
checkAnswer(sql("SELECT * FROM test_table"), Row(1))
505+
}
506+
}
507+
}
497508
}

0 commit comments

Comments
 (0)