
Commit d207813

committed: address comments.
1 parent 8c718b3 · commit d207813

2 files changed: 16 additions, 46 deletions

2 files changed

+16
-46
lines changed

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
Lines changed: 7 additions & 21 deletions

@@ -242,29 +242,15 @@ final class DataFrameWriter private[sql](df: DataFrame) extends Logging {
     } yield {
       require(n > 0 && n < 100000, "Bucket number must be greater than 0 and less than 100000.")

-      if (normalizedParCols.isEmpty) {
-        BucketSpec(n, normalizedBucketColNames.get, normalizedSortColNames.getOrElse(Nil))
-      } else {
-        // When partitionBy and bucketBy are used at the same time, the overlapping columns are
-        // useless. Thus, we removed these overlapping columns from bucketBy.
-        val bucketColumns: Seq[String] =
-          normalizedBucketColNames.get.filterNot(normalizedParCols.get.contains)
-
-        if (bucketColumns.nonEmpty) {
-          if (bucketColumns.length != normalizedBucketColNames.get.length) {
-            val removedColumns: Seq[String] =
-              normalizedBucketColNames.get.filter(normalizedParCols.get.contains)
-            logInfo(s"bucketBy columns is changed to '${bucketColumnNames.get.mkString(", ")}' " +
-              s"after removing the columns '${removedColumns.mkString(", ")}' that are part of " +
-              s"partitionBy columns '${partitioningColumns.get.mkString(", ")}'")
-          }
-          BucketSpec(n, bucketColumns, normalizedSortColNames.getOrElse(Nil))
-        } else {
+      // partitionBy columns cannot be used in bucketBy
+      if (normalizedParCols.nonEmpty &&
+        normalizedBucketColNames.get.toSet.intersect(normalizedParCols.get.toSet).nonEmpty) {
           throw new AnalysisException(
-            s"bucketBy columns '${bucketColumnNames.get.mkString(", ")}' should not be the " +
-              s"subset of partitionBy columns '${partitioningColumns.get.mkString(", ")}'")
-        }
+            s"bucketBy columns '${bucketColumnNames.get.mkString(", ")}' should not be part of " +
+              s"partitionBy columns '${partitioningColumns.get.mkString(", ")}'")
       }
+
+      BucketSpec(n, normalizedBucketColNames.get, normalizedSortColNames.getOrElse(Nil))
     }
   }

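For readers skimming the hunk, the new rule boils down to this: bucketBy columns must be disjoint from partitionBy columns, and any overlap now fails analysis instead of being silently pruned away. Below is a minimal standalone sketch of that check, written as a paraphrase for illustration rather than the committed code; the helper name and the use of IllegalArgumentException are assumptions, since the real code throws AnalysisException from inside DataFrameWriter.

object BucketingValidationSketch {
  // Hypothetical helper mirroring the committed check: reject any overlap
  // between bucketBy and partitionBy columns instead of pruning it away.
  def validateBucketing(
      bucketCols: Option[Seq[String]],
      partitionCols: Option[Seq[String]]): Unit = {
    for (buckets <- bucketCols; partitions <- partitionCols) {
      val overlap = buckets.toSet.intersect(partitions.toSet)
      if (overlap.nonEmpty) {
        // The commit throws org.apache.spark.sql.AnalysisException at this point.
        throw new IllegalArgumentException(
          s"bucketBy columns '${buckets.mkString(", ")}' should not be part of " +
            s"partitionBy columns '${partitions.mkString(", ")}'")
      }
    }
  }

  def main(args: Array[String]): Unit = {
    validateBucketing(Some(Seq("j", "k")), Some(Seq("i")))   // passes: no overlap
    validateBucketing(Some(Seq("i", "k")), Some(Seq("i")))   // throws: "i" overlaps
  }
}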
sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala
Lines changed: 9 additions & 25 deletions

@@ -162,34 +162,18 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingle
   }

   test("write bucketed data with the overlapping bucketBy and partitionBy columns") {
-    for (source <- Seq("parquet", "json", "orc")) {
-      withTable("bucketed_table") {
-        df.write
-          .format(source)
-          .partitionBy("i")
-          .bucketBy(8, "i", "k")
-          .sortBy("k")
-          .saveAsTable("bucketed_table")
-
-        for (i <- 0 until 5) {
-          // After column pruning, the actual bucketBy columns only contain `k`, which
-          // is identical to the sortBy column.
-          testBucketing(new File(tableDir, s"i=$i"), source, 8, Seq("k"), Seq("k"))
-        }
-      }
-    }
+    intercept[AnalysisException](df.write
+      .partitionBy("i")
+      .bucketBy(8, "i", "k")
+      .sortBy("k")
+      .saveAsTable("bucketed_table"))
   }

   test("write bucketed data with the identical bucketBy and partitionBy columns") {
-    for (source <- Seq("parquet", "json", "orc")) {
-      withTable("bucketed_table") {
-        intercept[AnalysisException](df.write
-          .format(source)
-          .partitionBy("i")
-          .bucketBy(8, "i")
-          .saveAsTable("bucketed_table"))
-      }
-    }
+    intercept[AnalysisException](df.write
+      .partitionBy("i")
+      .bucketBy(8, "i")
+      .saveAsTable("bucketed_table"))
   }

   test("write bucketed data without partitionBy") {