Commit 24174f0

Fixed docs
1 parent 29524b1 commit 24174f0

File tree: 3 files changed, +18 -117 lines changed


sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 16 additions & 115 deletions
@@ -238,34 +238,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
 
   /**
    * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]].
-   *
-   * This function goes through the input once to determine the input schema. If you know the
-   * schema in advance, use the version that specifies the schema to avoid the extra scan.
-   *
-   * You can set the following JSON-specific options to deal with non-standard JSON files:
-   * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
-   * <li>`prefersDecimal` (default `false`): infers all floating-point values as a decimal
-   * type. If the values do not fit in decimal, then it infers them as doubles.</li>
-   * <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
-   * <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
-   * <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes
-   * </li>
-   * <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
-   * (e.g. 00012)</li>
-   * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
-   * character using backslash quoting mechanism</li>
-   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
-   * during parsing.</li>
-   * <ul>
-   * <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts the
-   * malformed string into a new field configured by `columnNameOfCorruptRecord`. When
-   * a schema is set by user, it sets `null` for extra fields.</li>
-   * <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
-   * <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
-   * </ul>
-   * <li>`columnNameOfCorruptRecord` (default is the value specified in
-   * `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
-   * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
+   * See the documentation on the overloaded `json()` method with varargs for more details.
    *
    * @since 1.4.0
    */
@@ -281,6 +254,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * schema in advance, use the version that specifies the schema to avoid the extra scan.
    *
    * You can set the following JSON-specific options to deal with non-standard JSON files:
+   * <ul>
    * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
    * <li>`prefersDecimal` (default `false`): infers all floating-point values as a decimal
    * type. If the values do not fit in decimal, then it infers them as doubles.</li>
@@ -304,7 +278,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`columnNameOfCorruptRecord` (default is the value specified in
    * `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
    * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
-   *
+   * </ul>
    * @since 2.0.0
    */
   @scala.annotation.varargs
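
For context, a minimal sketch of how the JSON options documented above are passed through `DataFrameReader` (the path and option values are illustrative and not part of this commit; `spark` is assumed to be an existing `SparkSession`):

```scala
// Sketch only: reads line-delimited JSON with a few of the options listed above.
// The input path is a placeholder.
val people = spark.read
  .option("primitivesAsString", "true")                   // infer primitives as strings
  .option("mode", "PERMISSIVE")                           // keep corrupt records instead of failing
  .option("columnNameOfCorruptRecord", "_corrupt_record") // where malformed text is stored
  .json("examples/src/main/resources/people.json")
```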
@@ -356,54 +330,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads a CSV file and returns the result as a [[DataFrame]].
-   *
-   * This function will go through the input once to determine the input schema if `inferSchema`
-   * is enabled. To avoid going through the entire data once, disable `inferSchema` option or
-   * specify the schema explicitly using [[schema]].
-   *
-   * You can set the following CSV-specific options to deal with CSV files:
-   * <li>`sep` (default `,`): sets the single character as a separator for each
-   * field and value.</li>
-   * <li>`encoding` (default `UTF-8`): decodes the CSV files by the given encoding
-   * type.</li>
-   * <li>`quote` (default `"`): sets the single character used for escaping quoted values where
-   * the separator can be part of the value. If you would like to turn off quotations, you need to
-   * set not `null` but an empty string. This behaviour is different form
-   * `com.databricks.spark.csv`.</li>
-   * <li>`escape` (default `\`): sets the single character used for escaping quotes inside
-   * an already quoted value.</li>
-   * <li>`comment` (default empty string): sets the single character used for skipping lines
-   * beginning with this character. By default, it is disabled.</li>
-   * <li>`header` (default `false`): uses the first line as names of columns.</li>
-   * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
-   * requires one extra pass over the data.</li>
-   * <li>`ignoreLeadingWhiteSpace` (default `false`): defines whether or not leading whitespaces
-   * from values being read should be skipped.</li>
-   * <li>`ignoreTrailingWhiteSpace` (default `false`): defines whether or not trailing
-   * whitespaces from values being read should be skipped.</li>
-   * <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
-   * <li>`nanValue` (default `NaN`): sets the string representation of a non-number" value.</li>
-   * <li>`positiveInf` (default `Inf`): sets the string representation of a positive infinity
-   * value.</li>
-   * <li>`negativeInf` (default `-Inf`): sets the string representation of a negative infinity
-   * value.</li>
-   * <li>`dateFormat` (default `null`): sets the string that indicates a date format. Custom date
-   * formats follow the formats at `java.text.SimpleDateFormat`. This applies to both date type
-   * and timestamp type. By default, it is `null` which means trying to parse times and date by
-   * `java.sql.Timestamp.valueOf()` and `java.sql.Date.valueOf()`.</li>
-   * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns
-   * a record can have.</li>
-   * <li>`maxCharsPerColumn` (default `1000000`): defines the maximum number of characters allowed
-   * for any given value being read.</li>
-   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
-   * during parsing.</li>
-   * <ul>
-   * <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record. When
-   * a schema is set by user, it sets `null` for extra fields.</li>
-   * <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
-   * <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
-   * </ul>
+   * Loads a CSV file and returns the result as a [[DataFrame]]. See the documentation on the
+   * other overloaded `csv()` method for more details.
    *
    * @since 2.0.0
    */
@@ -420,6 +348,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * specify the schema explicitly using [[schema]].
    *
    * You can set the following CSV-specific options to deal with CSV files:
+   * <ul>
    * <li>`sep` (default `,`): sets the single character as a separator for each
    * field and value.</li>
    * <li>`encoding` (default `UTF-8`): decodes the CSV files by the given encoding
@@ -461,20 +390,15 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
    * <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
    * </ul>
-   *
+   * </ul>
    * @since 2.0.0
    */
   @scala.annotation.varargs
   def csv(paths: String*): DataFrame = format("csv").load(paths : _*)
 
   /**
-   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
-   * [[DataFrame]] if no paths are passed in.
-   *
-   * You can set the following Parquet-specific option(s) for reading Parquet files:
-   * <li>`mergeSchema` (default is the value specified in `spark.sql.parquet.mergeSchema`): sets
-   * whether we should merge schemas collected from all Parquet part-files. This will override
-   * `spark.sql.parquet.mergeSchema`.</li>
+   * Loads a Parquet file, returning the result as a [[DataFrame]]. See the documentation
+   * on the other overloaded `parquet()` method for more details.
    *
    * @since 2.0.0
    */
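
Similarly, a minimal sketch of a CSV read using options kept in the varargs `csv()` overload above (file path and option values are illustrative; `spark` is assumed to be an existing `SparkSession`):

```scala
// Sketch only: reads a CSV file with a header row and schema inference.
val sales = spark.read
  .option("header", "true")       // first line supplies column names
  .option("inferSchema", "true")  // costs one extra pass over the data
  .option("nullValue", "NA")      // string to interpret as null
  .csv("data/sales.csv")          // placeholder path
```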
@@ -484,14 +408,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
-   * [[DataFrame]] if no paths are passed in.
+   * Loads a Parquet file, returning the result as a [[DataFrame]].
    *
    * You can set the following Parquet-specific option(s) for reading Parquet files:
+   * <ul>
    * <li>`mergeSchema` (default is the value specified in `spark.sql.parquet.mergeSchema`): sets
    * whether we should merge schemas collected from all Parquet part-files. This will override
    * `spark.sql.parquet.mergeSchema`.</li>
-   *
+   * </ul>
    * @since 1.4.0
    */
   @scala.annotation.varargs
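
A minimal sketch of the `mergeSchema` option described above, reading several Parquet directories at once (the paths are illustrative; `spark` is assumed to be an existing `SparkSession`):

```scala
// Sketch only: merges the schemas collected from all Parquet part-files,
// overriding spark.sql.parquet.mergeSchema for this read.
val events = spark.read
  .option("mergeSchema", "true")
  .parquet("data/events/year=2015", "data/events/year=2016") // placeholder paths
```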
@@ -534,18 +458,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
 
   /**
    * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named
-   * "value", and followed by partitioned columns if there are any.
-   *
-   * Each line in the text files is a new row in the resulting DataFrame. For example:
-   * {{{
-   *   // Scala:
-   *   spark.read.text("/path/to/spark/README.md")
+   * "value", and followed by partitioned columns if there are any. See the documentation on
+   * the other overloaded `text()` method for more details.
    *
-   *   // Java:
-   *   spark.read().text("/path/to/spark/README.md")
-   * }}}
-   *
-   * @param path input path
    * @since 2.0.0
    */
   def text(path: String): DataFrame = {
@@ -573,22 +488,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   def text(paths: String*): DataFrame = format("text").load(paths : _*)
 
   /**
-   * Loads text files and returns a [[Dataset]] of String. The underlying schema of the Dataset
-   * contains a single string column named "value".
-   *
-   * If the directory structure of the text files contains partitioning information, those are
-   * ignored in the resulting Dataset. To include partitioning information as columns, use `text`.
-   *
-   * Each line in the text files is a new element in the resulting Dataset. For example:
-   * {{{
-   *   // Scala:
-   *   spark.read.textFile("/path/to/spark/README.md")
-   *
-   *   // Java:
-   *   spark.read().textFile("/path/to/spark/README.md")
-   * }}}
-   *
-   * @param path input path
+   * Loads text files and returns a [[Dataset]] of String. See the documentation on the
+   * other overloaded `textFile()` method for more details.
    * @since 2.0.0
    */
   def textFile(path: String): Dataset[String] = {
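
To illustrate the distinction the simplified Scaladoc points at, a minimal sketch contrasting `text()` and `textFile()` (the README path comes from the removed examples; `spark` is assumed to be an existing `SparkSession`):

```scala
import org.apache.spark.sql.{DataFrame, Dataset}

// Sketch only: text() returns a DataFrame with a "value" column (plus any
// partition columns); textFile() returns a plain Dataset[String] and ignores them.
val asRows: DataFrame = spark.read.text("/path/to/spark/README.md")
val asStrings: Dataset[String] = spark.read.textFile("/path/to/spark/README.md")
```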

sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java

Lines changed: 1 addition & 1 deletion
@@ -146,7 +146,7 @@ public void testParquetAPI() {
 
   /**
    * This only tests whether API compiles, but does not run it as orc()
-   * cannot be run with Hive classes.
+   * cannot be run without Hive classes.
    */
   public void testOrcAPI() {
     spark.read().schema(schema).orc();

sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -348,7 +348,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
 
   /**
    * This only tests whether API compiles, but does not run it as orc()
-   * cannot be run with Hive classes.
+   * cannot be run without Hive classes.
    */
   ignore("orc - API") {
     // Reader, with user specified schema
