
Commit baf7a63

Add some comments for consistent compression names and add none/uncompressed for text-based datasources

1 parent 2304bfb · commit baf7a63

File tree

3 files changed: +25 −15 lines changed

  python/pyspark/sql/readwriter.py
  sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CompressionCodecs.scala


python/pyspark/sql/readwriter.py

Lines changed: 10 additions & 7 deletions
@@ -465,7 +465,8 @@ def json(self, path, mode=None, compression=None):
         * ``ignore``: Silently ignore this operation if data already exists.
         * ``error`` (default case): Throw an exception if data already exists.
         :param compression: compression codec to use when saving to file. This can be one of the
-                            known case-insensitive shorten names (bzip2, gzip, lz4, and snappy).
+                            known case-insensitive shorten names (none, bzip2, gzip, lz4,
+                            snappy and deflate).
 
         >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
         """
@@ -487,8 +488,8 @@ def parquet(self, path, mode=None, partitionBy=None, compression=None):
         * ``error`` (default case): Throw an exception if data already exists.
         :param partitionBy: names of partitioning columns
         :param compression: compression codec to use when saving to file. This can be one of the
-                            known case-insensitive shorten names (uncompressed, snappy, gzip, and
-                            lzo). This will overwrite ``spark.sql.parquet.compression.codec``.
+                            known case-insensitive shorten names (none, snappy, gzip, and lzo).
+                            This will overwrite ``spark.sql.parquet.compression.codec``.
 
         >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))
         """
@@ -505,7 +506,8 @@ def text(self, path, compression=None):
 
         :param path: the path in any Hadoop supported file system
         :param compression: compression codec to use when saving to file. This can be one of the
-                            known case-insensitive shorten names (bzip2, gzip, lz4, and snappy).
+                            known case-insensitive shorten names (none, bzip2, gzip, lz4,
+                            snappy and deflate).
 
         The DataFrame must have only one column that is of string type.
         Each row becomes a new line in the output file.
@@ -527,7 +529,8 @@ def csv(self, path, mode=None, compression=None):
         * ``error`` (default case): Throw an exception if data already exists.
 
         :param compression: compression codec to use when saving to file. This can be one of the
-                            known case-insensitive shorten names (bzip2, gzip, lz4, and snappy).
+                            known case-insensitive shorten names (none, bzip2, gzip, lz4,
+                            snappy and deflate).
 
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
         """
@@ -552,8 +555,8 @@ def orc(self, path, mode=None, partitionBy=None, compression=None):
         * ``error`` (default case): Throw an exception if data already exists.
         :param partitionBy: names of partitioning columns
         :param compression: compression codec to use when saving to file. This can be one of the
-                            known case-insensitive shorten names (uncompressed, snappy, zlib, and
-                            lzo). This will overwrite ``orc.compress``.
+                            known case-insensitive shorten names (none, snappy, zlib, and lzo).
+                            This will overwrite ``orc.compress``.
 
         >>> orc_df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
         >>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 10 additions & 7 deletions
@@ -455,7 +455,8 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    *
    * You can set the following JSON-specific option(s) for writing JSON files:
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
-   * one of the known case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). </li>
+   * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`,
+   * `snappy` and `deflate`). </li>
    *
    * @since 1.4.0
    */
@@ -470,8 +471,8 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    *
    * You can set the following Parquet-specific option(s) for writing Parquet files:
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
-   * one of the known case-insensitive shorten names(`uncompressed`, `snappy`,`gzip`, and
-   * `lzo`). This will overwrite `spark.sql.parquet.compression.codec`. </li>
+   * one of the known case-insensitive shorten names(`none`, `snappy`, `gzip`, and `lzo`).
+   * This will overwrite `spark.sql.parquet.compression.codec`. </li>
    *
    * @since 1.4.0
    */
@@ -486,8 +487,8 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    *
    * You can set the following ORC-specific option(s) for writing ORC files:
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
-   * one of the known case-insensitive shorten names(`uncompressed`, `snappy`, `zlib`, and
-   * `lzo`). This will overwrite `orc.compress`. </li>
+   * one of the known case-insensitive shorten names(`none`, `snappy`, `zlib`, and `lzo`).
+   * This will overwrite `orc.compress`. </li>
    *
    * @since 1.5.0
    * @note Currently, this method can only be used together with `HiveContext`.
@@ -508,7 +509,8 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    *
    * You can set the following option(s) for writing text files:
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
-   * one of the known case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). </li>
+   * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`,
+   * `snappy` and `deflate`). </li>
    *
    * @since 1.6.0
    */
@@ -523,7 +525,8 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    *
    * You can set the following CSV-specific option(s) for writing CSV files:
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
-   * one of the known case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). </li>
+   * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`,
+   * `snappy` and `deflate`). </li>
    *
    * @since 2.0.0
    */
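
For illustration, a minimal usage sketch of the documented option (not part of this commit; the DataFrame `df` and the output paths below are placeholders). With the updated docs, `none` is a recognized short name alongside the codec names:

  // assumes an existing DataFrame `df`; paths are illustrative only
  df.write.option("compression", "none").json("/tmp/out-json")        // explicitly uncompressed output
  df.write.option("compression", "gzip").csv("/tmp/out-csv")          // short name resolved to a Hadoop codec
  df.write.option("compression", "none").parquet("/tmp/out-parquet")  // takes precedence over spark.sql.parquet.compression.codec

The same `compression` parameter is exposed on the PySpark writer methods in readwriter.py above.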

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CompressionCodecs.scala

Lines changed: 5 additions & 1 deletion
@@ -25,6 +25,8 @@ import org.apache.spark.util.Utils
 
 private[datasources] object CompressionCodecs {
   private val shortCompressionCodecNames = Map(
+    "none" -> null,
+    "uncompressed" -> null,
     "bzip2" -> classOf[BZip2Codec].getName,
     "deflate" -> classOf[DeflateCodec].getName,
     "gzip" -> classOf[GzipCodec].getName,
@@ -39,7 +41,9 @@ private[datasources] object CompressionCodecs {
     val codecName = shortCompressionCodecNames.getOrElse(name.toLowerCase, name)
     try {
       // Validate the codec name
-      Utils.classForName(codecName)
+      if (codecName != null) {
+        Utils.classForName(codecName)
+      }
       codecName
     } catch {
       case e: ClassNotFoundException =>
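
To see the resolution logic in isolation, here is a self-contained Scala sketch of the same idea under stated assumptions: the object name `CodecResolutionSketch` and the helper `resolveCodec` are hypothetical, the map is trimmed to a few entries, and `Class.forName` stands in for Spark's internal `Utils.classForName`:

  import org.apache.hadoop.io.compress.{BZip2Codec, DeflateCodec, GzipCodec}

  object CodecResolutionSketch {
    // `none` and `uncompressed` deliberately resolve to null, meaning
    // "configure no output codec at all".
    private val shortNames: Map[String, String] = Map(
      "none" -> null,
      "uncompressed" -> null,
      "bzip2" -> classOf[BZip2Codec].getName,
      "deflate" -> classOf[DeflateCodec].getName,
      "gzip" -> classOf[GzipCodec].getName)

    def resolveCodec(name: String): String = {
      val codecName = shortNames.getOrElse(name.toLowerCase, name)
      // Only validate real class names; a null codec is a valid result now.
      if (codecName != null) {
        Class.forName(codecName) // throws ClassNotFoundException on bad names
      }
      codecName
    }
  }

Returning null lets callers skip setting any Hadoop compression codec, which is what both `none` and `uncompressed` now mean for the text-based datasources.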
