
Commit 5fe321e

SPARK-4176: Spark should save decimal values with precision > 18 as Parquet FIXED_LEN_BYTE_ARRAY
Parquet defines multiple ways to store decimals. This patch enables reading all of those variations, as well as writing decimals in the smallest fixed-length container possible (INT32, INT64, FIXED_LEN_BYTE_ARRAY).
1 parent 085a721 commit 5fe321e
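
For context on what "smallest container possible" means here: the Parquet format spec lets a DECIMAL annotation sit on INT32 (precision up to 9), INT64 (up to 18), or a FIXED_LEN_BYTE_ARRAY of a computed width. A rough, hedged sketch of that selection in Scala (illustrative only; containerFor and minBytesForPrecision are hypothetical names, not the commit's code):

    object DecimalContainerSketch {
      sealed trait Container
      case object Int32 extends Container                      // precision <= 9
      case object Int64 extends Container                      // precision <= 18
      final case class FixedLenByteArray(numBytes: Int) extends Container

      // Smallest number of two's-complement bytes that can hold `precision` decimal digits.
      def minBytesForPrecision(precision: Int): Int =
        Iterator.from(1).find(n => math.pow(2.0, 8 * n - 1) >= math.pow(10.0, precision)).get

      // Pick the smallest container the Parquet format spec allows for this precision.
      def containerFor(precision: Int): Container =
        if (precision <= 9) Int32
        else if (precision <= 18) Int64
        else FixedLenByteArray(minBytesForPrecision(precision))
    }

The actual change wires this choice into CatalystSchemaConverter and ParquetTypesConverter in the diffs below.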

File tree

5 files changed: +75 −58 lines


sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala

Lines changed: 16 additions & 17 deletions
@@ -169,11 +169,12 @@ private[parquet] class CatalystSchemaConverter(
       }

     case INT96 =>
-      CatalystSchemaConverter.analysisRequire(
-        assumeInt96IsTimestamp,
-        "INT96 is not supported unless it's interpreted as timestamp. " +
-          s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.")
-      TimestampType
+      field.getOriginalType match {
+        case DECIMAL => makeDecimalType(maxPrecisionForBytes(12))
+        case _ if assumeInt96IsTimestamp => TimestampType
+        case null => makeDecimalType(maxPrecisionForBytes(12))
+        case _ => illegalType()
+      }

     case BINARY =>
       field.getOriginalType match {
@@ -373,8 +374,10 @@

     // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and
     // always store decimals in fixed-length byte arrays.
+    // Always storing FIXED_LEN_BYTE_ARRAY is thus compatible with spark <= 1.4.x, except for
+    // precisions > 18.
     case DecimalType.Fixed(precision, scale)
-      if precision <= maxPrecisionForBytes(8) && !followParquetFormatSpec =>
+      if !followParquetFormatSpec =>
       Types
         .primitive(FIXED_LEN_BYTE_ARRAY, repetition)
         .as(DECIMAL)
@@ -383,30 +386,25 @@
         .length(minBytesForPrecision(precision))
         .named(field.name)

-    case dec @ DecimalType() if !followParquetFormatSpec =>
-      throw new AnalysisException(
-        s"Data type $dec is not supported. " +
-          s"When ${SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key} is set to false," +
-          "decimal precision and scale must be specified, " +
-          "and precision must be less than or equal to 18.")
-
     // =====================================
     // Decimals (follow Parquet format spec)
     // =====================================

-    // Uses INT32 for 1 <= precision <= 9
+    // Uses INT32 for 4 byte encodings / precision <= 9
     case DecimalType.Fixed(precision, scale)
-      if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec =>
+      if followParquetFormatSpec && maxPrecisionForBytes(3) < precision &&
+        precision <= maxPrecisionForBytes(4) =>
       Types
         .primitive(INT32, repetition)
         .as(DECIMAL)
         .precision(precision)
         .scale(scale)
         .named(field.name)

-    // Uses INT64 for 1 <= precision <= 18
+    // Uses INT64 for 8 byte encodings / precision <= 18
     case DecimalType.Fixed(precision, scale)
-      if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec =>
+      if followParquetFormatSpec && maxPrecisionForBytes(7) < precision &&
+        precision <= maxPrecisionForBytes(8) =>
       Types
         .primitive(INT64, repetition)
         .as(DECIMAL)
@@ -562,4 +560,5 @@ private[parquet] object CatalystSchemaConverter {
     throw new AnalysisException(message)
   }
 }
+
 }
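
These hunks lean on maxPrecisionForBytes, whose definition is not part of this diff. A plausible definition, consistent with the guards above (9 digits for 4 bytes, 18 digits for 8 bytes), would be:

    // Assumed helper (not shown in this diff): the largest decimal precision that a signed
    // two's-complement value of `numBytes` bytes can always represent.
    def maxPrecisionForBytes(numBytes: Int): Int =
      math.floor(math.log10(math.pow(2, 8 * numBytes - 1) - 1)).toInt

    // Sanity checks matching the case guards above (illustrative):
    // maxPrecisionForBytes(4) == 9, maxPrecisionForBytes(8) == 18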

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala

Lines changed: 16 additions & 12 deletions
@@ -17,6 +17,7 @@

 package org.apache.spark.sql.parquet

+import java.math.BigInteger
 import java.nio.ByteOrder

 import scala.collection.mutable.{ArrayBuffer, Buffer, HashMap}
@@ -241,26 +242,29 @@ private[parquet] abstract class CatalystConverter extends GroupConverter {
   def getCurrentRecord: InternalRow = throw new UnsupportedOperationException

   /**
-   * Read a decimal value from a Parquet Binary into "dest". Only supports decimals that fit in
-   * a long (i.e. precision <= 18)
+   * Read a decimal value from a Parquet Binary into "dest".
    *
    * Returned value is needed by CatalystConverter, which doesn't reuse the Decimal object.
    */
   protected[parquet] def readDecimal(dest: Decimal, value: Binary, ctype: DecimalType): Decimal = {
     val precision = ctype.precisionInfo.get.precision
     val scale = ctype.precisionInfo.get.scale
     val bytes = value.getBytes
-    require(bytes.length <= 16, "Decimal field too large to read")
-    var unscaled = 0L
-    var i = 0
-    while (i < bytes.length) {
-      unscaled = (unscaled << 8) | (bytes(i) & 0xFF)
-      i += 1
+    if (precision <= 18) {
+      var unscaled = 0L
+      var i = 0
+      while (i < bytes.length) {
+        unscaled = (unscaled << 8) | (bytes(i) & 0xFF)
+        i += 1
+      }
+      // Make sure unscaled has the right sign, by sign-extending the first bit
+      val numBits = 8 * bytes.length
+      unscaled = (unscaled << (64 - numBits)) >> (64 - numBits)
+      dest.set(unscaled, precision, scale)
+    } else {
+      val decimal = new java.math.BigDecimal(new BigInteger(bytes), scale)
+      dest.set(new BigDecimal(decimal))
     }
-    // Make sure unscaled has the right sign, by sign-extending the first bit
-    val numBits = 8 * bytes.length
-    unscaled = (unscaled << (64 - numBits)) >> (64 - numBits)
-    dest.set(unscaled, precision, scale)
   }

   /**
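
The precision <= 18 branch above packs the big-endian bytes into a Long and then sign-extends. A self-contained sketch of that decoding step (standalone illustration, not Spark code):

    // Decode a big-endian two's-complement byte array (at most 8 bytes) into a Long.
    def decodeUnscaledLong(bytes: Array[Byte]): Long = {
      require(bytes.nonEmpty && bytes.length <= 8, "at most 8 bytes fit in a Long")
      var unscaled = 0L
      var i = 0
      while (i < bytes.length) {
        unscaled = (unscaled << 8) | (bytes(i) & 0xFF)  // accumulate big-endian bytes
        i += 1
      }
      // Sign-extend from the value's width up to 64 bits.
      val numBits = 8 * bytes.length
      (unscaled << (64 - numBits)) >> (64 - numBits)
    }

    // Example: Array(0xFF.toByte, 0x85.toByte) decodes to -123.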

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala

Lines changed: 28 additions & 17 deletions
@@ -212,10 +212,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
       case BooleanType => writer.addBoolean(value.asInstanceOf[Boolean])
       case DateType => writer.addInteger(value.asInstanceOf[Int])
       case d: DecimalType =>
-        if (d.precisionInfo == None || d.precisionInfo.get.precision > 18) {
-          sys.error(s"Unsupported datatype $d, cannot write to consumer")
-        }
-        writeDecimal(value.asInstanceOf[Decimal], d.precisionInfo.get.precision)
+        writeDecimal(value.asInstanceOf[Decimal], d.precisionInfo.map(_.precision).getOrElse(10))
       case _ => sys.error(s"Do not know how to writer $schema to consumer")
     }
   }
@@ -297,19 +294,35 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
   }

   // Scratch array used to write decimals as fixed-length binary
-  private[this] val scratchBytes = new Array[Byte](8)
+  private[this] val scratchBytes = new Array[Byte](4096)

   private[parquet] def writeDecimal(decimal: Decimal, precision: Int): Unit = {
     val numBytes = ParquetTypesConverter.BYTES_FOR_PRECISION(precision)
-    val unscaledLong = decimal.toUnscaledLong
-    var i = 0
-    var shift = 8 * (numBytes - 1)
-    while (i < numBytes) {
-      scratchBytes(i) = (unscaledLong >> shift).toByte
-      i += 1
-      shift -= 8
+    if (precision <= 18) {
+      val unscaledLong = decimal.toUnscaledLong
+      var i = 0
+      var shift = 8 * (numBytes - 1)
+      while (i < numBytes) {
+        scratchBytes(i) = (unscaledLong >> shift).toByte
+        i += 1
+        shift -= 8
+      }
+      writer.addBinary(Binary.fromByteArray(scratchBytes, 0, numBytes))
+    } else {
+      val bytes = decimal.toBigDecimal.underlying.unscaledValue.toByteArray()
+      val outBuffer =
+        if (bytes.length == numBytes) {
+          bytes
+        } else {
+          val b = if (numBytes <= scratchBytes.length) scratchBytes else new Array[Byte](numBytes)
+          if (b == scratchBytes && numBytes < scratchBytes.length) {
+            java.util.Arrays.fill(b, 0, numBytes - bytes.length, 0.toByte)
+          }
+          System.arraycopy(bytes, 0, b, numBytes - bytes.length, bytes.length)
+          b
+        }
+      writer.addBinary(Binary.fromByteArray(outBuffer, 0, numBytes))
     }
-    writer.addBinary(Binary.fromByteArray(scratchBytes, 0, numBytes))
   }

   // array used to write Timestamp as Int96 (fixed-length binary)
@@ -367,10 +380,8 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
       case DateType => writer.addInteger(record.getInt(index))
       case TimestampType => writeTimestamp(record.getLong(index))
       case d: DecimalType =>
-        if (d.precisionInfo == None || d.precisionInfo.get.precision > 18) {
-          sys.error(s"Unsupported datatype $d, cannot write to consumer")
-        }
-        writeDecimal(record(index).asInstanceOf[Decimal], d.precisionInfo.get.precision)
+        writeDecimal(record(index).asInstanceOf[Decimal],
+          d.precisionInfo.map(_.precision).getOrElse(10))
       case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
     }
   }
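
For precisions above 18, the writer above takes the unscaled value's minimal two's-complement encoding (BigInteger.toByteArray) and left-pads it into a numBytes-wide buffer. A standalone sketch of that padding idea, using the sign byte as the pad value (the commit itself zero-fills a reused scratch buffer instead):

    import java.math.BigInteger

    // Sketch (not the commit's code): widen BigInteger.toByteArray, which is the minimal
    // big-endian two's-complement form, out to a fixed numBytes width.
    def toFixedLength(unscaled: BigInteger, numBytes: Int): Array[Byte] = {
      val minimal = unscaled.toByteArray
      require(minimal.length <= numBytes, s"value needs ${minimal.length} bytes, only $numBytes given")
      val out = new Array[Byte](numBytes)
      // Pad with the sign byte so the widened value keeps the same numeric meaning.
      val pad: Byte = if (unscaled.signum < 0) 0xFF.toByte else 0x00
      java.util.Arrays.fill(out, 0, numBytes - minimal.length, pad)
      System.arraycopy(minimal, 0, out, numBytes - minimal.length, minimal.length)
      out
    }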

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala

Lines changed: 14 additions & 3 deletions
@@ -43,16 +43,27 @@ private[parquet] object ParquetTypesConverter extends Logging {
   }

   /**
-   * Compute the FIXED_LEN_BYTE_ARRAY length needed to represent a given DECIMAL precision.
+   * BYTES_FOR_PRECISION computes the required bytes to store a value of a certain decimal
+   * precision.
    */
-  private[parquet] val BYTES_FOR_PRECISION = Array.tabulate[Int](38) { precision =>
-    var length = 1
+  private[parquet] def BYTES_FOR_PRECISION_COMPUTE(precision : Int) : Int = {
+    var length = (precision / math.log10(2) - 1).toInt / 8
     while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) {
       length += 1
     }
     length
   }

+  private[parquet] def BYTES_FOR_PRECISION_STATIC =
+    (0 to 30).map(BYTES_FOR_PRECISION_COMPUTE).toArray
+
+  private[parquet] def BYTES_FOR_PRECISION(precision : Int) : Int =
+    if (precision < BYTES_FOR_PRECISION_STATIC.length) {
+      BYTES_FOR_PRECISION_STATIC(precision)
+    } else {
+      BYTES_FOR_PRECISION_COMPUTE(precision)
+    }
+
   def convertToAttributes(
       parquetSchema: MessageType,
       isBinaryAsString: Boolean,
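
As a rough check on what BYTES_FOR_PRECISION produces, here is a standalone re-implementation of the same width computation with a few sample precisions (illustrative only; the expected widths follow from the formula: 4 bytes for precision 9, 8 for 18, 16 for 38):

    // Standalone copy of the byte-width loop above, starting from length = 1 for simplicity.
    def bytesForPrecision(precision: Int): Int = {
      var length = 1
      while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) length += 1
      length
    }

    // Expected: 1 -> 1, 9 -> 4 (fits INT32), 18 -> 8 (fits INT64), 19 -> 9, 38 -> 16 bytes.
    Seq(1, 9, 18, 19, 38).foreach(p => println(s"precision $p -> ${bytesForPrecision(p)} bytes"))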

sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala

Lines changed: 1 addition & 9 deletions
@@ -107,22 +107,14 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       // Parquet doesn't allow column names with spaces, have to add an alias here
       .select($"_1" cast decimal as "dec")

-    for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) {
+    for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17), (19, 0), (60, 5))) {
       withTempPath { dir =>
         val data = makeDecimalRDD(DecimalType(precision, scale))
         data.write.parquet(dir.getCanonicalPath)
         checkAnswer(sqlContext.read.parquet(dir.getCanonicalPath), data.collect().toSeq)
       }
     }

-    // Decimals with precision above 18 are not yet supported
-    intercept[Throwable] {
-      withTempPath { dir =>
-        makeDecimalRDD(DecimalType(19, 10)).write.parquet(dir.getCanonicalPath)
-        sqlContext.read.parquet(dir.getCanonicalPath).collect()
-      }
-    }
-
     // Unlimited-length decimals are not yet supported
     intercept[Throwable] {
       withTempPath { dir =>
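
With the write path above in place, a high-precision decimal should round-trip through the DataFrame API roughly as follows (a hedged usage sketch against the Spark 1.x-era API used in this suite; the path and the value are made up):

    import org.apache.spark.sql.types.DecimalType
    import sqlContext.implicits._

    // 34-digit value: needs precision > 18, so it is written as FIXED_LEN_BYTE_ARRAY.
    val df = sqlContext.sparkContext
      .parallelize(Seq(BigDecimal("12345678901234567890123456789.12345")))
      .map(Tuple1.apply)
      .toDF("dec")
      .select($"dec".cast(DecimalType(38, 5)) as "dec")

    df.write.parquet("/tmp/high_precision_decimals")
    sqlContext.read.parquet("/tmp/high_precision_decimals").show()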
