[SPARK-30414][SQL] ParquetRowConverter optimizations: arrays, maps, plus misc. constant factors
### What changes were proposed in this pull request?
This PR implements multiple performance optimizations for `ParquetRowConverter`, achieving some modest constant-factor wins for all fields and larger wins for map and array fields:
- Add `private[this]` to several `val`s (90cebf0)
- Keep a `fieldUpdaters` array, saving two `.updater()` calls per field (7318785): I suspect that these are often megamorphic calls, so cutting them out seems like it could be a relatively large performance win.
- Only call `currentRow.numFields` once per `start()` call (e05de15): previously we'd call it once per field and this had a significant enough cost that it was visible during profiling.
- Reuse buffers in array and map converters (c7d1534, 6d16f59): previously we would create a brand-new Scala `ArrayBuffer` for each field read, but this isn't actually necessary because the data is already copied into a fresh array when `end()` constructs a `GenericArrayData`.
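The buffer-reuse idea in the last bullet can be sketched as follows. This is a simplified, hypothetical converter (the names `SimpleArrayConverter`, `start`, `addElement`, and `end` are illustrative, not Spark's actual `ParquetRowConverter` internals): because `end()` snapshots the buffer's contents into a fresh array, the same `ArrayBuffer` can be cleared and reused for the next record instead of being reallocated per field read.

```scala
import scala.collection.mutable.ArrayBuffer

// Hypothetical simplified array converter illustrating the reuse pattern.
class SimpleArrayConverter {
  // Reused across records; cleared in start() rather than reallocated.
  private[this] val currentArray = ArrayBuffer.empty[Int]

  def start(): Unit = currentArray.clear()

  def addElement(v: Int): Unit = currentArray += v

  // end() copies the buffer into a fresh array (analogous to constructing
  // a GenericArrayData), so reusing the buffer afterwards is safe.
  def end(): Array[Int] = currentArray.toArray
}

object Demo {
  def main(args: Array[String]): Unit = {
    val conv = new SimpleArrayConverter
    conv.start()
    conv.addElement(1)
    conv.addElement(2)
    val first = conv.end()
    conv.start() // buffer reused, not reallocated
    conv.addElement(3)
    val second = conv.end()
    assert(first.toSeq == Seq(1, 2)) // earlier snapshot is unaffected
    assert(second.toSeq == Seq(3))
  }
}
```

The key safety argument is the comment on `end()`: the snapshot copy is what makes clearing the shared buffer between records correct.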
### Why are the changes needed?
To improve Parquet read performance; this is complementary to #26993's (orthogonal) improvements for nested struct read performance.
### Does this PR introduce any user-facing change?
No.
### How was this patch tested?
Existing tests, plus manual benchmarking with both synthetic and realistic schemas (similar to the ones in #26993). I've observed scan-performance improvements of roughly 10% or more on certain real-world datasets.
Closes #27089 from JoshRosen/joshrosen/more-ParquetRowConverter-optimizations.
Lead-authored-by: Josh Rosen <[email protected]>
Co-authored-by: Josh Rosen <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>