Commit f79f63c ("address comments")
1 parent b9377fa

File tree: 3 files changed (+38, -15 lines)

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolder.java
Lines changed: 4 additions & 0 deletions

@@ -29,6 +29,10 @@
  * this class per writing program, so that the memory segment/data buffer can be reused. Note that
  * for each incoming record, we should call `reset` of BufferHolder instance before write the record
  * and reuse the data buffer.
+ *
+ * Generally we should call `UnsafeRow.setTotalSize` and pass in `BufferHolder.totalSize` to update
+ * the size of the result row, after writing a record to the buffer. However, we can skip this step
+ * if the fields of row are all fixed-length, as the size of result row is also fixed.
  */
 public class BufferHolder {
   public byte[] buffer;
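
The new paragraph describes a calling pattern without showing one, so here is a minimal sketch of that pattern, not the generated projection code. The `write(int, long)` overload and the assumption that `row` is already backed by `holder.buffer` are mine; only `reset`, `totalSize` and `setTotalSize` are named in this diff.

    // Hedged sketch: write one record whose fields are all fixed-length.
    static void writeRecord(BufferHolder holder, UnsafeRowWriter writer, UnsafeRow row) {
      holder.reset();             // rewind the shared buffer for the incoming record
      writer.zeroOutNullBytes();  // outermost writer: clearing the null bits is enough
      writer.write(0, 42L);       // two fixed-length long columns (assumed overload)
      writer.write(1, 43L);
      // Variable-length fields grow the buffer, so in general the final size must
      // be published after writing; with only fixed-length fields, the comment
      // above says this call can be skipped. It is kept here to show where it goes.
      row.setTotalSize(holder.totalSize());
    }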

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java
Lines changed: 22 additions & 12 deletions

@@ -29,11 +29,14 @@
  * A helper class to write data into global row buffer using `UnsafeRow` format.
  *
  * It will remember the offset of row buffer which it starts to write, and move the cursor of row
- * buffer while writing. If a new record comes, the cursor of row buffer will be reset, so we need
- * to also call `reset` of this class before writing, to update the `startingOffset` and clear out
- * null bits. Note that if we use it to write data into the result unsafe row, which means we will
- * always write from the very beginning of the global row buffer, we don't need to update
- * `startingOffset` and can just call `zeroOutNullBytes` before writing new record.
+ * buffer while writing. If new data(can be the input record if this is the outermost writer, or
+ * nested struct if this is an inner writer) comes, the starting cursor of row buffer may be
+ * changed, so we need to call `UnsafeRowWriter.reset` before writing, to update the
+ * `startingOffset` and clear out null bits.
+ *
+ * Note that if this is the outermost writer, which means we will always write from the very
+ * beginning of the global row buffer, we don't need to update `startingOffset` and can just call
+ * `zeroOutNullBytes` before writing new data.
  */
 public class UnsafeRowWriter {
 

@@ -43,6 +46,17 @@ public class UnsafeRowWriter {
   private final int nullBitsSize;
   private final int fixedSize;
 
+  public UnsafeRowWriter(BufferHolder holder, int numFields) {
+    this.holder = holder;
+    this.nullBitsSize = UnsafeRow.calculateBitSetWidthInBytes(numFields);
+    this.fixedSize = nullBitsSize + 8 * numFields;
+    this.startingOffset = holder.cursor;
+  }
+
+  /**
+   * Resets the `startingOffset` according to the current cursor of row buffer, and clear out null
+   * bits. This should be called before we write a new nested struct to the row buffer.
+   */
   public void reset() {
     this.startingOffset = holder.cursor;
 

@@ -53,19 +67,15 @@ public void reset() {
     zeroOutNullBytes();
   }
 
+  /**
+   * Clears out null bits. This should be called before we write a new row to row buffer.
+   */
   public void zeroOutNullBytes() {
     for (int i = 0; i < nullBitsSize; i += 8) {
       Platform.putLong(holder.buffer, startingOffset + i, 0L);
     }
   }
 
-  public UnsafeRowWriter(BufferHolder holder, int numFields) {
-    this.holder = holder;
-    this.nullBitsSize = UnsafeRow.calculateBitSetWidthInBytes(numFields);
-    this.fixedSize = nullBitsSize + 8 * numFields;
-    this.startingOffset = holder.cursor;
-  }
-
   private void zeroOutPaddingBytes(int numBytes) {
     if ((numBytes & 0x07) > 0) {
       Platform.putLong(holder.buffer, holder.cursor + ((numBytes >> 3) << 3), 0L);
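
The reworked Javadoc draws a distinction worth seeing in code: the outermost writer always starts at the beginning of the global buffer, while an inner writer starts wherever the cursor currently sits. Below is a hedged sketch of that distinction. Linking the struct into its parent row (the offset-and-size bookkeeping) is elided, and the `write(int, long)` overload is an assumption; `reset()` and `zeroOutNullBytes()` are exactly the methods documented above. Note from the constructor shown in this diff that a two-field writer reserves fixedSize = 8 (one word of null bits) + 2 * 8 = 24 bytes of fixed-length space.

    // Hedged sketch: one outer row containing one nested-struct field.
    static void writeRowWithStruct(BufferHolder holder,
                                   UnsafeRowWriter rowWriter,      // created with numFields = 2
                                   UnsafeRowWriter structWriter) { // created with numFields = 1
      holder.reset();
      rowWriter.zeroOutNullBytes();   // outermost: startingOffset never moves,
                                      // so clearing the null bits is enough
      rowWriter.write(0, 1L);

      // The nested struct begins wherever the cursor happens to be now, so
      // reset() must recompute startingOffset and clear its null bits first.
      structWriter.reset();
      structWriter.write(0, 2L);
      // ...the parent's field 1 would then record the struct's offset and size...
    }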

sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala
Lines changed: 12 additions & 3 deletions

@@ -24,7 +24,7 @@ import org.apache.spark.sql.types._
 import org.apache.spark.util.Benchmark
 
 /**
- * Benchmark [[UnsafeProjection]] for flat schema(primitive-type fields).
+ * Benchmark [[UnsafeProjection]] for fixed-length/primitive-type fields.
  */
 object UnsafeProjectionBenchmark {
 

@@ -86,7 +86,7 @@ object UnsafeProjectionBenchmark {
     val rows3 = generateRows(schema3, numRows)
     val projection3 = UnsafeProjection.create(attrs3, attrs3)
 
-    benchmark.addCase("primitive types") { _ =>
+    benchmark.addCase("7 primitive types") { _ =>
       for (_ <- 1 to iters) {
         var sum = 0L
         var i = 0

@@ -110,7 +110,7 @@ object UnsafeProjectionBenchmark {
     val rows4 = generateRows(schema4, numRows)
     val projection4 = UnsafeProjection.create(attrs4, attrs4)
 
-    benchmark.addCase("nullable primitive types") { _ =>
+    benchmark.addCase("7 nullable primitive types") { _ =>
       for (_ <- 1 to iters) {
         var sum = 0L
         var i = 0

@@ -122,6 +122,15 @@ object UnsafeProjectionBenchmark {
     }
 
 
+    /*
+    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
+    unsafe projection:              Avg Time(ms)    Avg Rate(M/s)  Relative Rate
+    -------------------------------------------------------------------------------
+    single long                          1533.34           175.07         1.00 X
+    single nullable long                 2306.73           116.37         0.66 X
+    primitive types                      8403.93            31.94         0.18 X
+    nullable primitive types            12448.39            21.56         0.12 X
+    */
     benchmark.run()
   }
 }
