reduce the perf regression of vectorized parquet reader caused by datetime rebase

cloud-fan · cloud-fan · commit 056e7f0f4b6f · 2020-04-29T22:25:38.000+08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
@@ -71,6 +71,8 @@ object RebaseDateTime {
     -719164, -682945, -646420, -609895, -536845, -500320, -463795,
     -390745, -354220, -317695, -244645, -208120, -171595, -141427)
 
+  final val lastSwitchJulianDay: Int = julianGregDiffSwitchDay.last
+
   // The first days of Common Era (CE) which is mapped to the '0001-01-01' date in Julian calendar.
   private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0)
 
@@ -416,6 +418,8 @@ object RebaseDateTime {
   // in the interval: [julianGregDiffSwitchMicros(i), julianGregDiffSwitchMicros(i+1))
   private val julianGregRebaseMap = loadRebaseRecords("julian-gregorian-rebase-micros.json")
 
+  final val lastSwitchJulianTs: Long = julianGregRebaseMap.values.map(_.switches.last).max
+
   /**
    * An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]]. This method leverages
    * the pre-calculated rebasing maps to save calculation. If the rebasing map doesn't contain
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
@@ -423,15 +423,8 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) throw
           num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
     } else if (column.dataType() == DataTypes.DateType ) {
       if (rebaseDateTime) {
-        for (int i = 0; i < num; i++) {
-          if (defColumn.readInteger() == maxDefLevel) {
-            column.putInt(
-              rowId + i,
-              RebaseDateTime.rebaseJulianToGregorianDays(dataColumn.readInteger()));
-          } else {
-            column.putNull(rowId + i);
-          }
-        }
+        defColumn.readIntegersWithRebase(
+          num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
       } else {
         defColumn.readIntegers(
            num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
@@ -449,15 +442,8 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) thro
         num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
     } else if (originalType == OriginalType.TIMESTAMP_MICROS) {
       if (rebaseDateTime) {
-        for (int i = 0; i < num; i++) {
-          if (defColumn.readInteger() == maxDefLevel) {
-            column.putLong(
-              rowId + i,
-              RebaseDateTime.rebaseJulianToGregorianMicros(dataColumn.readLong()));
-          } else {
-            column.putNull(rowId + i);
-          }
-        }
+        defColumn.readLongsWithRebase(
+          num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
       } else {
         defColumn.readLongs(
           num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java
@@ -22,6 +22,7 @@
 
 import org.apache.parquet.bytes.ByteBufferInputStream;
 import org.apache.parquet.io.ParquetDecodingException;
+import org.apache.spark.sql.catalyst.util.RebaseDateTime;
 import org.apache.spark.sql.execution.vectorized.WritableColumnVector;
 
 import org.apache.parquet.column.values.ValuesReader;
@@ -81,6 +82,33 @@ public final void readIntegers(int total, WritableColumnVector c, int rowId) {
     }
   }
 
+  // A fork of `readIntegers` to rebase the date values. For performance reasons, this method
+  // iterates the values twice: first check whether ANY value needs rebasing (hence `|=`), then
+  // rebase all values if so, or go to the optimized non-rebasing branch otherwise.
+  @Override
+  public final void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) {
+    int requiredBytes = total * 4;
+    ByteBuffer buffer = getBuffer(requiredBytes);
+    boolean rebase = false;
+    for (int i = 0; i < total; i += 1) {
+      rebase |= buffer.getInt(buffer.position() + i * 4) < RebaseDateTime.lastSwitchJulianDay();
+    }
+    if (rebase) {
+      for (int i = 0; i < total; i += 1) {
+        c.putInt(rowId + i, RebaseDateTime.rebaseJulianToGregorianDays(buffer.getInt()));
+      }
+    } else {
+      if (buffer.hasArray()) {
+        int offset = buffer.arrayOffset() + buffer.position();
+        c.putIntsLittleEndian(rowId, total, buffer.array(), offset);
+      } else {
+        for (int i = 0; i < total; i += 1) {
+          c.putInt(rowId + i, buffer.getInt());
+        }
+      }
+    }
+  }
+
   @Override
   public final void readLongs(int total, WritableColumnVector c, int rowId) {
     int requiredBytes = total * 8;
@@ -96,6 +124,33 @@ public final void readLongs(int total, WritableColumnVector c, int rowId) {
     }
   }
 
+  // A fork of `readLongs` to rebase the timestamp values. For performance reasons, this method
+  // iterates the values twice: first check whether ANY value needs rebasing (hence `|=`), then
+  // rebase all values if so, or go to the optimized non-rebasing branch otherwise.
+  @Override
+  public final void readLongsWithRebase(int total, WritableColumnVector c, int rowId) {
+    int requiredBytes = total * 8;
+    ByteBuffer buffer = getBuffer(requiredBytes);
+    boolean rebase = false;
+    for (int i = 0; i < total; i += 1) {
+      rebase |= buffer.getLong(buffer.position() + i * 8) < RebaseDateTime.lastSwitchJulianTs();
+    }
+    if (rebase) {
+      for (int i = 0; i < total; i += 1) {
+        c.putLong(rowId + i, RebaseDateTime.rebaseJulianToGregorianMicros(buffer.getLong()));
+      }
+    } else {
+      if (buffer.hasArray()) {
+        int offset = buffer.arrayOffset() + buffer.position();
+        c.putLongsLittleEndian(rowId, total, buffer.array(), offset);
+      } else {
+        for (int i = 0; i < total; i += 1) {
+          c.putLong(rowId + i, buffer.getLong());
+        }
+      }
+    }
+  }
+
   @Override
   public final void readFloats(int total, WritableColumnVector c, int rowId) {
     int requiredBytes = total * 4;
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java
@@ -26,6 +26,7 @@
 import org.apache.parquet.io.ParquetDecodingException;
 import org.apache.parquet.io.api.Binary;
 
+import org.apache.spark.sql.catalyst.util.RebaseDateTime;
 import org.apache.spark.sql.execution.vectorized.WritableColumnVector;
 
 import java.io.IOException;
@@ -203,6 +204,43 @@ public void readIntegers(
     }
   }
 
+  // A fork of `readIntegers`, which rebases the date int value (days) before filling
+  // the Spark column vector.
+  public void readIntegersWithRebase(
+      int total,
+      WritableColumnVector c,
+      int rowId,
+      int level,
+      VectorizedValuesReader data) throws IOException {
+    int left = total;
+    while (left > 0) {
+      if (this.currentCount == 0) this.readNextGroup();
+      int n = Math.min(left, this.currentCount);
+      switch (mode) {
+        case RLE:
+          if (currentValue == level) {
+            data.readIntegersWithRebase(n, c, rowId);
+          } else {
+            c.putNulls(rowId, n);
+          }
+          break;
+        case PACKED:
+          for (int i = 0; i < n; ++i) {
+            if (currentBuffer[currentBufferIdx++] == level) {
+              c.putInt(rowId + i,
+                RebaseDateTime.rebaseJulianToGregorianDays(data.readInteger()));
+            } else {
+              c.putNull(rowId + i);
+            }
+          }
+          break;
+      }
+      rowId += n;
+      left -= n;
+      currentCount -= n;
+    }
+  }
+
   // TODO: can this code duplication be removed without a perf penalty?
   public void readBooleans(
       int total,
@@ -342,6 +380,43 @@ public void readLongs(
     }
   }
 
+  // A fork of `readLongs`, which rebases the timestamp long value (microseconds) before filling
+  // the Spark column vector.
+  public void readLongsWithRebase(
+      int total,
+      WritableColumnVector c,
+      int rowId,
+      int level,
+      VectorizedValuesReader data) throws IOException {
+    int left = total;
+    while (left > 0) {
+      if (this.currentCount == 0) this.readNextGroup();
+      int n = Math.min(left, this.currentCount);
+      switch (mode) {
+        case RLE:
+          if (currentValue == level) {
+            data.readLongsWithRebase(n, c, rowId);
+          } else {
+            c.putNulls(rowId, n);
+          }
+          break;
+        case PACKED:
+          for (int i = 0; i < n; ++i) {
+            if (currentBuffer[currentBufferIdx++] == level) {
+              c.putLong(rowId + i,
+                RebaseDateTime.rebaseJulianToGregorianMicros(data.readLong()));
+            } else {
+              c.putNull(rowId + i);
+            }
+          }
+          break;
+      }
+      rowId += n;
+      left -= n;
+      currentCount -= n;
+    }
+  }
+
   public void readFloats(
       int total,
       WritableColumnVector c,
@@ -508,6 +583,11 @@ public void readIntegers(int total, WritableColumnVector c, int rowId) {
     }
   }
 
+  @Override
+  public void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) {
+    throw new UnsupportedOperationException("only readInts is valid.");
+  }
+
   @Override
   public byte readByte() {
     throw new UnsupportedOperationException("only readInts is valid.");
@@ -523,6 +603,11 @@ public void readLongs(int total, WritableColumnVector c, int rowId) {
     throw new UnsupportedOperationException("only readInts is valid.");
   }
 
+  @Override
+  public void readLongsWithRebase(int total, WritableColumnVector c, int rowId) {
+    throw new UnsupportedOperationException("only readInts is valid.");
+  }
+
   @Override
   public void readBinary(int total, WritableColumnVector c, int rowId) {
     throw new UnsupportedOperationException("only readInts is valid.");
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java
@@ -40,7 +40,9 @@ public interface VectorizedValuesReader {
   void readBooleans(int total, WritableColumnVector c, int rowId);
   void readBytes(int total, WritableColumnVector c, int rowId);
   void readIntegers(int total, WritableColumnVector c, int rowId);
+  void readIntegersWithRebase(int total, WritableColumnVector c, int rowId);
   void readLongs(int total, WritableColumnVector c, int rowId);
+  void readLongsWithRebase(int total, WritableColumnVector c, int rowId);
   void readFloats(int total, WritableColumnVector c, int rowId);
   void readDoubles(int total, WritableColumnVector c, int rowId);
   void readBinary(int total, WritableColumnVector c, int rowId);
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala