Commit 24174f0

Fixed docs
1 parent 29524b1 commit 24174f0

File tree: 3 files changed, +18 -117 lines changed


sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 16 additions & 115 deletions
@@ -238,34 +238,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
 
   /**
    * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]].
-   *
-   * This function goes through the input once to determine the input schema. If you know the
-   * schema in advance, use the version that specifies the schema to avoid the extra scan.
-   *
-   * You can set the following JSON-specific options to deal with non-standard JSON files:
-   * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
-   * <li>`prefersDecimal` (default `false`): infers all floating-point values as a decimal
-   * type. If the values do not fit in decimal, then it infers them as doubles.</li>
-   * <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
-   * <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
-   * <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes
-   * </li>
-   * <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
-   * (e.g. 00012)</li>
-   * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
-   * character using backslash quoting mechanism</li>
-   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
-   * during parsing.</li>
-   * <ul>
-   * <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts the
-   * malformed string into a new field configured by `columnNameOfCorruptRecord`. When
-   * a schema is set by user, it sets `null` for extra fields.</li>
-   * <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
-   * <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
-   * </ul>
-   * <li>`columnNameOfCorruptRecord` (default is the value specified in
-   * `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
-   * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
+   * See the documentation on the overloaded `json()` method with varargs for more details.
    *
    * @since 1.4.0
    */
@@ -281,6 +254,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * schema in advance, use the version that specifies the schema to avoid the extra scan.
    *
    * You can set the following JSON-specific options to deal with non-standard JSON files:
+   * <ul>
    * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
    * <li>`prefersDecimal` (default `false`): infers all floating-point values as a decimal
    * type. If the values do not fit in decimal, then it infers them as doubles.</li>
@@ -304,7 +278,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`columnNameOfCorruptRecord` (default is the value specified in
    * `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
    * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
-   *
+   * </ul>
    * @since 2.0.0
    */
   @scala.annotation.varargs
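
For context, a minimal sketch of how the JSON options documented above are passed through `DataFrameReader` (the path and option values are illustrative and not part of this commit; `spark` is assumed to be an existing `SparkSession`):

```scala
// Sketch only: reads line-delimited JSON with a few of the options listed above.
// The input path is a placeholder.
val people = spark.read
  .option("primitivesAsString", "true")                   // infer primitives as strings
  .option("mode", "PERMISSIVE")                           // keep corrupt records instead of failing
  .option("columnNameOfCorruptRecord", "_corrupt_record") // where malformed text is stored
  .json("examples/src/main/resources/people.json")
```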
@@ -356,54 +330,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads a CSV file and returns the result as a [[DataFrame]].
-   *
-   * This function will go through the input once to determine the input schema if `inferSchema`
-   * is enabled. To avoid going through the entire data once, disable `inferSchema` option or
-   * specify the schema explicitly using [[schema]].
-   *
-   * You can set the following CSV-specific options to deal with CSV files:
-   * <li>`sep` (default `,`): sets the single character as a separator for each
-   * field and value.</li>
-   * <li>`encoding` (default `UTF-8`): decodes the CSV files by the given encoding
-   * type.</li>
-   * <li>`quote` (default `"`): sets the single character used for escaping quoted values where
-   * the separator can be part of the value. If you would like to turn off quotations, you need to
-   * set not `null` but an empty string. This behaviour is different form
-   * `com.databricks.spark.csv`.</li>
-   * <li>`escape` (default `\`): sets the single character used for escaping quotes inside
-   * an already quoted value.</li>
-   * <li>`comment` (default empty string): sets the single character used for skipping lines
-   * beginning with this character. By default, it is disabled.</li>
-   * <li>`header` (default `false`): uses the first line as names of columns.</li>
-   * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
-   * requires one extra pass over the data.</li>
-   * <li>`ignoreLeadingWhiteSpace` (default `false`): defines whether or not leading whitespaces
-   * from values being read should be skipped.</li>
-   * <li>`ignoreTrailingWhiteSpace` (default `false`): defines whether or not trailing
-   * whitespaces from values being read should be skipped.</li>
-   * <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
-   * <li>`nanValue` (default `NaN`): sets the string representation of a non-number" value.</li>
-   * <li>`positiveInf` (default `Inf`): sets the string representation of a positive infinity
-   * value.</li>
-   * <li>`negativeInf` (default `-Inf`): sets the string representation of a negative infinity
-   * value.</li>
-   * <li>`dateFormat` (default `null`): sets the string that indicates a date format. Custom date
-   * formats follow the formats at `java.text.SimpleDateFormat`. This applies to both date type
-   * and timestamp type. By default, it is `null` which means trying to parse times and date by
-   * `java.sql.Timestamp.valueOf()` and `java.sql.Date.valueOf()`.</li>
-   * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns
-   * a record can have.</li>
-   * <li>`maxCharsPerColumn` (default `1000000`): defines the maximum number of characters allowed
-   * for any given value being read.</li>
-   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
-   * during parsing.</li>
-   * <ul>
-   * <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record. When
-   * a schema is set by user, it sets `null` for extra fields.</li>
-   * <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
-   * <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
-   * </ul>
+   * Loads a CSV file and returns the result as a [[DataFrame]]. See the documentation on the
+   * other overloaded `csv()` method for more details.
    *
    * @since 2.0.0
    */
@@ -420,6 +348,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * specify the schema explicitly using [[schema]].
    *
    * You can set the following CSV-specific options to deal with CSV files:
+   * <ul>
    * <li>`sep` (default `,`): sets the single character as a separator for each
    * field and value.</li>
    * <li>`encoding` (default `UTF-8`): decodes the CSV files by the given encoding
@@ -461,20 +390,15 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
    * <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
    * </ul>
-   *
+   * </ul>
    * @since 2.0.0
    */
   @scala.annotation.varargs
   def csv(paths: String*): DataFrame = format("csv").load(paths : _*)
 
   /**
-   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
-   * [[DataFrame]] if no paths are passed in.
-   *
-   * You can set the following Parquet-specific option(s) for reading Parquet files:
-   * <li>`mergeSchema` (default is the value specified in `spark.sql.parquet.mergeSchema`): sets
-   * whether we should merge schemas collected from all Parquet part-files. This will override
-   * `spark.sql.parquet.mergeSchema`.</li>
+   * Loads a Parquet file, returning the result as a [[DataFrame]]. See the documentation
+   * on the other overloaded `parquet()` method for more details.
    *
    * @since 2.0.0
    */
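
Similarly, a minimal sketch of a CSV read using options kept in the varargs `csv()` overload above (file path and option values are illustrative; `spark` is assumed to be an existing `SparkSession`):

```scala
// Sketch only: reads a CSV file with a header row and schema inference.
val sales = spark.read
  .option("header", "true")       // first line supplies column names
  .option("inferSchema", "true")  // costs one extra pass over the data
  .option("nullValue", "NA")      // string to interpret as null
  .csv("data/sales.csv")          // placeholder path
```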
@@ -484,14 +408,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty
-   * [[DataFrame]] if no paths are passed in.
+   * Loads a Parquet file, returning the result as a [[DataFrame]].
    *
    * You can set the following Parquet-specific option(s) for reading Parquet files:
+   * <ul>
    * <li>`mergeSchema` (default is the value specified in `spark.sql.parquet.mergeSchema`): sets
    * whether we should merge schemas collected from all Parquet part-files. This will override
    * `spark.sql.parquet.mergeSchema`.</li>
-   *
+   * </ul>
    * @since 1.4.0
    */
   @scala.annotation.varargs
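
A minimal sketch of the `mergeSchema` option described above, reading several Parquet directories at once (the paths are illustrative; `spark` is assumed to be an existing `SparkSession`):

```scala
// Sketch only: merges the schemas collected from all Parquet part-files,
// overriding spark.sql.parquet.mergeSchema for this read.
val events = spark.read
  .option("mergeSchema", "true")
  .parquet("data/events/year=2015", "data/events/year=2016") // placeholder paths
```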
@@ -534,18 +458,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
 
   /**
    * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named
-   * "value", and followed by partitioned columns if there are any.
-   *
-   * Each line in the text files is a new row in the resulting DataFrame. For example:
-   * {{{
-   *   // Scala:
-   *   spark.read.text("/path/to/spark/README.md")
+   * "value", and followed by partitioned columns if there are any. See the documentation on
+   * the other overloaded `text()` method for more details.
    *
-   *   // Java:
-   *   spark.read().text("/path/to/spark/README.md")
-   * }}}
-   *
-   * @param path input path
    * @since 2.0.0
    */
   def text(path: String): DataFrame = {
@@ -573,22 +488,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   def text(paths: String*): DataFrame = format("text").load(paths : _*)
 
   /**
-   * Loads text files and returns a [[Dataset]] of String. The underlying schema of the Dataset
-   * contains a single string column named "value".
-   *
-   * If the directory structure of the text files contains partitioning information, those are
-   * ignored in the resulting Dataset. To include partitioning information as columns, use `text`.
-   *
-   * Each line in the text files is a new element in the resulting Dataset. For example:
-   * {{{
-   *   // Scala:
-   *   spark.read.textFile("/path/to/spark/README.md")
-   *
-   *   // Java:
-   *   spark.read().textFile("/path/to/spark/README.md")
-   * }}}
-   *
-   * @param path input path
+   * Loads text files and returns a [[Dataset]] of String. See the documentation on the
+   * other overloaded `textFile()` method for more details.
    * @since 2.0.0
    */
   def textFile(path: String): Dataset[String] = {
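
To illustrate the distinction the simplified Scaladoc points at, a minimal sketch contrasting `text()` and `textFile()` (the README path comes from the removed examples; `spark` is assumed to be an existing `SparkSession`):

```scala
import org.apache.spark.sql.{DataFrame, Dataset}

// Sketch only: text() returns a DataFrame with a "value" column (plus any
// partition columns); textFile() returns a plain Dataset[String] and ignores them.
val asRows: DataFrame = spark.read.text("/path/to/spark/README.md")
val asStrings: Dataset[String] = spark.read.textFile("/path/to/spark/README.md")
```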

sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java

Lines changed: 1 addition & 1 deletion
@@ -146,7 +146,7 @@ public void testParquetAPI() {
 
   /**
    * This only tests whether API compiles, but does not run it as orc()
-   * cannot be run with Hive classes.
+   * cannot be run without Hive classes.
    */
   public void testOrcAPI() {
     spark.read().schema(schema).orc();

sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -348,7 +348,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
 
   /**
    * This only tests whether API compiles, but does not run it as orc()
-   * cannot be run with Hive classes.
+   * cannot be run without Hive classes.
    */
   ignore("orc - API") {
     // Reader, with user specified schema
