4 files changed: +20 −12 lines

File tree:
catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions
core/src/main/scala/org/apache/spark/sql
hive/src/test/scala/org/apache/spark/sql/sources

catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions
@@ -23,8 +23,6 @@ import java.util.zip.CRC32
 import org.apache.commons.codec.digest.DigestUtils
 
 import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.util.{MapData, ArrayData}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
core/src/main/scala/org/apache/spark/sql
@@ -119,8 +119,6 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * Partitions the output by the given columns on the file system. If specified, the output is
    * laid out on the file system similar to Hive's partitioning scheme.
    *
-   * This is only applicable for Parquet at the moment.
-   *
    * @since 1.4.0
    */
   @scala.annotation.varargs
@@ -129,13 +127,24 @@ final class DataFrameWriter private[sql](df: DataFrame) {
     this
   }
 
+  /**
+   * Buckets the output by the given columns on the file system. If specified, the output is
+   * laid out on the file system similar to Hive's bucketing scheme.
+   *
+   * @since 2.0
+   */
   @scala.annotation.varargs
   def bucketBy(numBuckets: Int, colName: String, colNames: String*): DataFrameWriter = {
     this.numBuckets = Option(numBuckets)
     this.bucketingColumns = Option(colName +: colNames)
     this
   }
 
+  /**
+   * Sorts the bucketed output by the given columns.
+   *
+   * @since 2.0
+   */
   @scala.annotation.varargs
   def sortBy(colName: String, colNames: String*): DataFrameWriter = {
     this.sortingColumns = Option(colName +: colNames)
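
For reference, a minimal usage sketch of the writer calls documented above; the DataFrame df, the column and table names, and the saveAsTable sink are assumptions for illustration, not part of this diff:

// Hypothetical example: partition by date, hash userId into 8 buckets per
// partition, and sort each bucket file by userId.
df.write
  .partitionBy("date")
  .bucketBy(8, "userId")
  .sortBy("userId")
  .saveAsTable("events")  // assumption: bucketed output targets a table sink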
core/src/main/scala/org/apache/spark/sql
@@ -327,9 +327,10 @@ private[sql] class DynamicPartitionWriterContainer(
     val getKey: InternalRow => UnsafeRow = if (bucketSpec.isEmpty) {
       val projection = UnsafeProjection.create(partitionColumns, inputSchema)
       row => projection(row)
-    } else {
+    } else { // If it's bucketed, we should also consider the bucket id as part of the key.
       val bucketColumns = bucketSpec.get.resolvedBucketingColumns(inputSchema)
       val getBucketKey = UnsafeProjection.create(bucketColumns, inputSchema)
+      // Leave an empty int slot at the end of the result row, so that we can set the bucket id later.
       val getResultRow = UnsafeProjection.create(partitionColumns :+ Literal(-1), inputSchema)
       row => {
         val bucketId = math.abs(getBucketKey(row).hashCode()) % bucketSpec.get.numBuckets
@@ -341,7 +342,7 @@ private[sql] class DynamicPartitionWriterContainer(
 
     val keySchema = if (bucketSpec.isEmpty) {
       StructType.fromAttributes(partitionColumns)
-    } else {
+    } else { // If it's bucketed, we should also consider the bucket id as part of the key.
       StructType.fromAttributes(partitionColumns).add("bucketId", IntegerType, nullable = false)
     }
 
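
To make the keying logic concrete, here is a small self-contained sketch that models the bucket-id computation with plain Scala values standing in for UnsafeProjection/UnsafeRow; the case class and names are invented for illustration:

// Simplified model of the key construction above: keep the partition
// columns, hash the bucketing columns, take a non-negative modulo to get
// the bucket id, and place it in the trailing slot that the real code
// reserves with Literal(-1).
case class InputRow(partition: String, bucketCol: Int)

def makeKey(row: InputRow, numBuckets: Int): (String, Int) = {
  val bucketId = math.abs(row.bucketCol.hashCode()) % numBuckets
  (row.partition, bucketId)
}

// makeKey(InputRow("2015-12-01", 42), 8) == ("2015-12-01", 2)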
hive/src/test/scala/org/apache/spark/sql/sources
@@ -120,8 +120,8 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
 
     for (row <- rows) {
       assert(row.isInstanceOf[UnsafeRow])
-      val actuaBucketId = math.abs(row.hashCode()) % 8
-      assert(actuaBucketId == bucketId)
+      val actualBucketId = math.abs(row.hashCode()) % 8
+      assert(actualBucketId == bucketId)
     }
   }
 }
@@ -151,8 +151,8 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
 
     for (row <- rows) {
       assert(row.isInstanceOf[UnsafeRow])
-      val actuaBucketId = math.abs(row.hashCode()) % 8
-      assert(actuaBucketId == bucketId)
+      val actualBucketId = math.abs(row.hashCode()) % 8
+      assert(actualBucketId == bucketId)
     }
   }
 }
@@ -183,8 +183,8 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
 
     for (row <- rows) {
       assert(row.isInstanceOf[UnsafeRow])
-      val actuaBucketId = math.abs(row.hashCode()) % 8
-      assert(actuaBucketId == bucketId)
+      val actualBucketId = math.abs(row.hashCode()) % 8
+      assert(actualBucketId == bucketId)
     }
   }
 }
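
As a standalone illustration of what the corrected assertions verify, this snippet groups values by math.abs(hashCode) % numBuckets and re-checks membership the same way; the values and bucket count are made up:

// Every value assigned to a bucket must hash back to that bucket's id.
val numBuckets = 8
val values = Seq(1, 42, 99, -7, 1234)
values.groupBy(v => math.abs(v.hashCode()) % numBuckets).foreach { case (id, vs) =>
  vs.foreach(v => assert(math.abs(v.hashCode()) % numBuckets == id))
}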