[SPARK-14459] [SQL] Detect relation partitioning and adjust the logical plan #12239
Changes from all commits: 067d12b, eb0b3b5, 0ba41c7, 2130db8
@@ -347,10 +347,23 @@ case class InsertIntoTable(
   override def children: Seq[LogicalPlan] = child :: Nil
   override def output: Seq[Attribute] = Seq.empty

+  private[spark] lazy val expectedColumns = {
+    if (table.output.isEmpty) {
+      None
+    } else {
+      val numDynamicPartitions = partition.values.count(_.isEmpty)
+      val (partitionColumns, dataColumns) = table.output
+        .partition(a => partition.keySet.contains(a.name))
+      Some(dataColumns ++ partitionColumns.takeRight(numDynamicPartitions))
+    }
Contributor: Seems that we can omit the …

Contributor (Author): @liancheng, I added a note about this below. The problem is that some relations return …
+  }
+
   assert(overwrite || !ifNotExists)
-  override lazy val resolved: Boolean = childrenResolved && child.output.zip(table.output).forall {
-    case (childAttr, tableAttr) =>
-      DataType.equalsIgnoreCompatibleNullability(childAttr.dataType, tableAttr.dataType)
+  override lazy val resolved: Boolean = childrenResolved && expectedColumns.forall { expected =>
+    child.output.size == expected.size && child.output.zip(expected).forall {
+      case (childAttr, tableAttr) =>
+        DataType.equalsIgnoreCompatibleNullability(childAttr.dataType, tableAttr.dataType)
+    }
   }
 }
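To make the ordering rule above concrete, here is a small self-contained sketch of the `expectedColumns` logic; `Attr` is a hypothetical stand-in for Catalyst's `Attribute`, and the example table layout is invented for illustration:

```scala
// Sketch of expectedColumns: data columns first, then only the dynamic
// partition columns (those with no static value), in table order.
case class Attr(name: String) // hypothetical stand-in for Attribute

def expectedColumns(
    tableOutput: Seq[Attr],
    partition: Map[String, Option[String]]): Option[Seq[Attr]] = {
  if (tableOutput.isEmpty) {
    None // some relations report no schema, so the check is skipped
  } else {
    val numDynamicPartitions = partition.values.count(_.isEmpty)
    val (partitionColumns, dataColumns) =
      tableOutput.partition(a => partition.keySet.contains(a.name))
    Some(dataColumns ++ partitionColumns.takeRight(numDynamicPartitions))
  }
}

// Table (id, data) partitioned by (part1, part2):
val cols = Seq(Attr("id"), Attr("data"), Attr("part1"), Attr("part2"))
expectedColumns(cols, Map("part1" -> None, "part2" -> None))
// => Some(List(id, data, part1, part2)) -- both partitions dynamic
expectedColumns(cols, Map("part1" -> Some("p"), "part2" -> None))
// => Some(List(id, data, part2)) -- static part1 is not read from the child
```

Note how a static partition value removes that column from the expected child output, which is why `resolved` must compare against `expectedColumns` rather than the raw `table.output`.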
@@ -22,21 +22,23 @@ import java.io.File
 import org.apache.hadoop.hive.conf.HiveConf
 import org.scalatest.BeforeAndAfter

 import org.apache.spark.SparkException
 import org.apache.spark.sql.{QueryTest, _}
-import org.apache.spark.sql.execution.QueryExecutionException
+import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils

 case class TestData(key: Int, value: String)

 case class ThreeCloumntable(key: Int, value: String, key1: String)

-class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter {
+class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
+    with SQLTestUtils {
   import hiveContext.implicits._
   import hiveContext.sql

-  val testData = hiveContext.sparkContext.parallelize(
+  override lazy val testData = hiveContext.sparkContext.parallelize(
     (1 to 100).map(i => TestData(i, i.toString))).toDF()

   before {
|
@@ -212,4 +214,77 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
     sql("DROP TABLE hiveTableWithStructValue")
   }

+  test("Reject partitioning that does not match table") {
+    withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) {
Contributor: I doubt whether this configuration is effective/respected in Spark. Not quite familiar with the newly refactored Spark configuration API. I guess it's probably not respected, and you don't need it to create tables partitioned by dynamic partitions only in Spark. cc @yhuai

Contributor (Author): This is enforced by the dynamic partition writer container for Hive to match Hive's behavior. Since these tests don't have a static partition, it's needed or else the tests fail. I can change this, but I'm not sure what you would suggest.

Contributor: Ah sorry, made a mistake here. I only checked that …
+      sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)")
+      val data = (1 to 10).map(i => (i, s"data-$i", if ((i % 2) == 0) "even" else "odd"))
+        .toDF("id", "data", "part")
+
+      intercept[AnalysisException] {
+        // cannot partition by 2 fields when there is only one in the table definition
+        data.write.partitionBy("part", "data").insertInto("partitioned")
+      }
+    }
+  }
+
| test("Test partition mode = strict") { | ||
| withSQLConf(("hive.exec.dynamic.partition.mode", "strict")) { | ||
| sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)") | ||
| val data = (1 to 10).map(i => (i, s"data-$i", if ((i % 2) == 0) "even" else "odd")) | ||
| .toDF("id", "data", "part") | ||
|
|
||
| intercept[SparkException] { | ||
| data.write.insertInto("partitioned") | ||
| } | ||
| } | ||
| } | ||
|
|
+  test("Detect table partitioning") {
+    withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) {
+      sql("CREATE TABLE source (id bigint, data string, part string)")
+      val data = (1 to 10).map(i => (i, s"data-$i", if ((i % 2) == 0) "even" else "odd")).toDF()
+
+      data.write.insertInto("source")
+      checkAnswer(sql("SELECT * FROM source"), data.collect().toSeq)
+
+      sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)")
+      // this will pick up the output partitioning from the table definition
+      sqlContext.table("source").write.insertInto("partitioned")
+
+      checkAnswer(sql("SELECT * FROM partitioned"), data.collect().toSeq)
+    }
+  }
|
|
+  test("Detect table partitioning with correct partition order") {
+    withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) {
+      sql("CREATE TABLE source (id bigint, part2 string, part1 string, data string)")
+      val data = (1 to 10).map(i => (i, if ((i % 2) == 0) "even" else "odd", "p", s"data-$i"))
+        .toDF("id", "part2", "part1", "data")
+
+      data.write.insertInto("source")
+      checkAnswer(sql("SELECT * FROM source"), data.collect().toSeq)
+
+      // the original data with part1 and part2 at the end
+      val expected = data.select("id", "data", "part1", "part2")
+
+      sql(
+        """CREATE TABLE partitioned (id bigint, data string)
+          |PARTITIONED BY (part1 string, part2 string)""".stripMargin)
+      sqlContext.table("source").write.insertInto("partitioned")
+
+      checkAnswer(sql("SELECT * FROM partitioned"), expected.collect().toSeq)
+    }
+  }
|
|
+  test("InsertIntoTable#resolved should include dynamic partitions") {
+    withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) {
+      sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)")
+      val data = (1 to 10).map(i => (i.toLong, s"data-$i")).toDF("id", "data")
+
+      val logical = InsertIntoTable(sqlContext.table("partitioned").logicalPlan,
+        Map("part" -> None), data.logicalPlan, overwrite = false, ifNotExists = false)
+      assert(!logical.resolved, "Should not resolve: missing partition data")
+    }
+  }
 }
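As a counterpart to the last test, a plan whose child also supplies the dynamic partition column should resolve, since `expectedColumns` then lines up with `child.output`. A hypothetical positive check in the same style (assuming matching column types; not part of the PR):

```scala
// Hypothetical: with the partition column present in the child's output,
// expectedColumns = (id, data, part) matches child.output exactly.
val withPart = (1 to 10).map(i => (i.toLong, s"data-$i", "p"))
  .toDF("id", "data", "part")
val resolvedPlan = InsertIntoTable(sqlContext.table("partitioned").logicalPlan,
  Map("part" -> None), withPart.logicalPlan, overwrite = false, ifNotExists = false)
assert(resolvedPlan.resolved, "Should resolve: dynamic partition data present")
```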
Contributor: when will the `partColumns` be different from `inputPartCols`? Seems never?

Contributor (Author): This does two things: …

Contributor: makes sense.