From f6bd4eceb28a5f75ae423fabc39ffea952b16612 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 16:35:45 +0800
Subject: [PATCH 1/8] [MINOR][DOCS] Fix some links for python api doc

---
 python/pyspark/sql/readwriter.py | 4 ++--
 python/pyspark/sql/streaming.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index e7ecb3ba9fc7..4bd1644174af 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -253,8 +253,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _partition discovery: /sql-data-sources-parquet.html#partition-discovery
-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+        .. _partition discovery: /docs/latest/sql-data-sources-parquet.html#partition-discovery
+        .. _datetime pattern: /docs/latest/sql-ref-datetime-pattern.html

         >>> df1 = spark.read.json('python/test_support/sql/people.json')
         >>> df1.dtypes
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index a83167882a8d..c220f1be4eda 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -489,8 +489,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _partition discovery: /sql-data-sources-parquet.html#partition-discovery
-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+        .. _partition discovery: /docs/latest/sql-data-sources-parquet.html#partition-discovery
+        .. _datetime pattern: /docs/latest/sql-ref-datetime-pattern.html

         >>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema)
         >>> json_sdf.isStreaming

From 2fd8b3b14da246b86b1b13e1016fc8b00416a5cb Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 17:14:03 +0800
Subject: [PATCH 2/8] style fix

---
 python/pyspark/sql/functions.py  | 6 +-----
 python/pyspark/sql/readwriter.py | 6 ------
 python/pyspark/sql/streaming.py  | 2 --
 3 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 9f8714fd8579..cfc653f04edd 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -975,7 +975,7 @@ def date_format(date, format):
     A pattern could be for instance `dd.MM.yyyy` and could return a string like
     '18.03.1993'. All pattern letters of `datetime pattern`_. can be used.

-    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+    .. _datetime pattern: /docs/latest/sql-ref-datetime-pattern.html

     .. note:: Use when ever possible specialized functions like `year`. These benefit from a
         specialized implementation.
@@ -1196,8 +1196,6 @@ def to_date(col, format=None):
     By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
     is omitted. Equivalent to ``col.cast("date")``.

-    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_date(df.t).alias('date')).collect()
     [Row(date=datetime.date(1997, 2, 28))]
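
A note on the reStructuredText idiom these hunks manipulate: an inline reference
written as `datetime pattern`_ resolves against a target line of the form
`.. _datetime pattern: <url>` in the same docstring, and those target lines are
where the URLs being fixed live. The sketch below is a hypothetical docstring,
not part of the Spark tree, showing a reference/target pair; a reference with no
matching target, or a target left behind with no reference, is the kind of
breakage docutils reports and that this series is tidying up.

    def fmt(col, format=None):
        """Format ``col`` using a `datetime pattern`_ such as ``dd.MM.yyyy``.

        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
        """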

@@ -1221,8 +1219,6 @@ def to_timestamp(col, format=None):
     By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType`
     if the format is omitted. Equivalent to ``col.cast("timestamp")``.

-    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_timestamp(df.t).alias('dt')).collect()
     [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4bd1644174af..d5350a4c987b 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -490,8 +490,6 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> df = spark.read.csv('python/test_support/sql/ages.csv')
         >>> df.dtypes
         [('_c0', 'string'), ('_c1', 'string')]
@@ -865,8 +863,6 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm
         :param ignoreNullFields: Whether to ignore null fields when generating JSON objects.
                                  If None is set, it uses the default value, ``true``.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode)
@@ -981,8 +977,6 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
         :param lineSep: defines the line separator that should be used for writing. If None is
                         set, it uses the default value, ``\\n``. Maximum length is 1 character.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode)
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index c220f1be4eda..26aa55dd5393 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -725,8 +725,6 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
         >>> csv_sdf.isStreaming
         True

From b4d0fc7a9d3783b72f52fea5a2230fb9ede51d96 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 18:20:36 +0800
Subject: [PATCH 3/8] address cmts

---
 python/pyspark/sql/functions.py  | 2 +-
 python/pyspark/sql/readwriter.py | 5 +++--
 python/pyspark/sql/streaming.py  | 5 +++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index cfc653f04edd..e089963a7fd5 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -975,7 +975,7 @@ def date_format(date, format):
     A pattern could be for instance `dd.MM.yyyy` and could return a string like
     '18.03.1993'. All pattern letters of `datetime pattern`_. can be used.

-    .. _datetime pattern: /docs/latest/sql-ref-datetime-pattern.html
+    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

     .. note:: Use when ever possible specialized functions like `year`. These benefit from a
         specialized implementation.
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index d5350a4c987b..4acdd1786e54 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -253,8 +253,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _partition discovery: /docs/latest/sql-data-sources-parquet.html#partition-discovery
-        .. _datetime pattern: /docs/latest/sql-ref-datetime-pattern.html
+        .. _partition discovery:
+            https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+        .. _datetime pattern: https://spark.apache.org//docs/latest/sql-ref-datetime-pattern.html

         >>> df1 = spark.read.json('python/test_support/sql/people.json')
         >>> df1.dtypes
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 26aa55dd5393..a5e86466579c 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -489,8 +489,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _partition discovery: /docs/latest/sql-data-sources-parquet.html#partition-discovery
-        .. _datetime pattern: /docs/latest/sql-ref-datetime-pattern.html
+        .. _partition discovery:
+            https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

         >>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema)
         >>> json_sdf.isStreaming

From a959d6a2622988bd9b3d2f2d5a892d0ff3229be3 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 19:31:58 +0800
Subject: [PATCH 4/8] fix style

---
 python/pyspark/sql/functions.py  | 4 ++++
 python/pyspark/sql/readwriter.py | 6 ++++++
 python/pyspark/sql/streaming.py  | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index e089963a7fd5..9f8714fd8579 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1196,6 +1196,8 @@ def to_date(col, format=None):
     By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
     is omitted. Equivalent to ``col.cast("date")``.

+    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_date(df.t).alias('date')).collect()
     [Row(date=datetime.date(1997, 2, 28))]
@@ -1219,6 +1221,8 @@ def to_timestamp(col, format=None):
     By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType`
     if the format is omitted. Equivalent to ``col.cast("timestamp")``.

+    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
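
For context on the URL this series keeps adjusting: the sql-ref-datetime-pattern
page documents the format strings accepted by functions such as to_date,
to_timestamp, and date_format. A minimal usage sketch, assuming an active
SparkSession bound to `spark`; the input value mirrors the doctests quoted in
the hunks above:

    from pyspark.sql.functions import date_format, to_date

    df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    # 'dd.MM.yyyy' is a datetime pattern from the linked reference page.
    df.select(date_format(to_date(df.t), 'dd.MM.yyyy').alias('d')).collect()
    # [Row(d='28.02.1997')]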
+
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_timestamp(df.t).alias('dt')).collect()
     [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4acdd1786e54..c86585bc5ce8 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -491,6 +491,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

+        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
         >>> df = spark.read.csv('python/test_support/sql/ages.csv')
         >>> df.dtypes
         [('_c0', 'string'), ('_c1', 'string')]
@@ -866,6 +868,8 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm
         :param ignoreNullFields: Whether to ignore null fields when generating JSON objects.
                                  If None is set, it uses the default value, ``true``.

+        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
         >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode)
@@ -982,6 +986,8 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
         :param lineSep: defines the line separator that should be used for writing. If None is
                         set, it uses the default value, ``\\n``. Maximum length is 1 character.

+        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode)
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index a5e86466579c..f28d91f28674 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -726,6 +726,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

+        .. _datetime pattern: https://spark.apache.org//docs/latest/sql-ref-datetime-pattern.html
+
         >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
         >>> csv_sdf.isStreaming
         True

From 6d45125c611525a52b71d140598cc04e128b13c0 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 19:33:16 +0800
Subject: [PATCH 5/8] Revert "fix style"

This reverts commit a959d6a2622988bd9b3d2f2d5a892d0ff3229be3.

---
 python/pyspark/sql/functions.py  | 4 ----
 python/pyspark/sql/readwriter.py | 6 ------
 python/pyspark/sql/streaming.py  | 2 --
 3 files changed, 12 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 9f8714fd8579..e089963a7fd5 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1196,8 +1196,6 @@ def to_date(col, format=None):
     By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
     is omitted. Equivalent to ``col.cast("date")``.

-    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_date(df.t).alias('date')).collect()
     [Row(date=datetime.date(1997, 2, 28))]
@@ -1221,8 +1219,6 @@ def to_timestamp(col, format=None):
     By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType`
     if the format is omitted.
Equivalent to ``col.cast("timestamp")``.

-    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_timestamp(df.t).alias('dt')).collect()
     [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index c86585bc5ce8..4acdd1786e54 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -491,8 +491,6 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> df = spark.read.csv('python/test_support/sql/ages.csv')
         >>> df.dtypes
         [('_c0', 'string'), ('_c1', 'string')]
@@ -866,8 +864,6 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm
         :param ignoreNullFields: Whether to ignore null fields when generating JSON objects.
                                  If None is set, it uses the default value, ``true``.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode)
@@ -982,8 +978,6 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
         :param lineSep: defines the line separator that should be used for writing. If None is
                         set, it uses the default value, ``\\n``. Maximum length is 1 character.

-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
         """
         self.mode(mode)
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index f28d91f28674..a5e86466579c 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -726,8 +726,6 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.

-        .. _datetime pattern: https://spark.apache.org//docs/latest/sql-ref-datetime-pattern.html
-
         >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
         >>> csv_sdf.isStreaming
         True

From 5346870c2444ea24ef34e485d247e7e2e7cae53b Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 19:36:18 +0800
Subject: [PATCH 6/8] fix link

---
 python/pyspark/sql/readwriter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4acdd1786e54..817978498fcf 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -255,7 +255,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         .. _partition discovery:
             https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
-        .. _datetime pattern: https://spark.apache.org//docs/latest/sql-ref-datetime-pattern.html
+        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
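
The recursiveFileLookup parameter that recurs in the context lines above is the
reader option whose docstring carries the `partition discovery`_ link. A usage
sketch with a hypothetical directory layout, again assuming an active
SparkSession bound to `spark`:

    # Nested files such as /data/events/a/x.json and /data/events/b/y.json are
    # read directly; partition discovery (e.g. date=2020-03-25/ directories
    # becoming a column) is disabled when this option is set.
    df = spark.read.json('/data/events', recursiveFileLookup=True)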

         >>> df1 = spark.read.json('python/test_support/sql/people.json')
         >>> df1.dtypes

From aaa1594c49baee965947c6645fa4f4912c69c9fd Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Wed, 25 Mar 2020 21:35:08 +0800
Subject: [PATCH 7/8] dep

---
 project/SparkBuild.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 4e1badb5e90c..44ef35b65a91 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -625,7 +625,8 @@ object DependencyOverrides {
     dependencyOverrides += "com.google.guava" % "guava" % "14.0.1",
     dependencyOverrides += "commons-io" % "commons-io" % "2.4",
     dependencyOverrides += "xerces" % "xercesImpl" % "2.12.0",
-    dependencyOverrides += "jline" % "jline" % "2.14.6")
+    dependencyOverrides += "jline" % "jline" % "2.14.6",
+    dependencyOverrides += "org.apache.avro" % "avro" % "1.8.2")
 }
 
 /**

From a1fbefe03cc6a3e8eacb92ee0103845d26c491f6 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Thu, 26 Mar 2020 09:45:47 +0800
Subject: [PATCH 8/8] Revert "dep"

This reverts commit aaa1594c49baee965947c6645fa4f4912c69c9fd.

---
 project/SparkBuild.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 44ef35b65a91..4e1badb5e90c 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -625,8 +625,7 @@ object DependencyOverrides {
     dependencyOverrides += "com.google.guava" % "guava" % "14.0.1",
     dependencyOverrides += "commons-io" % "commons-io" % "2.4",
     dependencyOverrides += "xerces" % "xercesImpl" % "2.12.0",
-    dependencyOverrides += "jline" % "jline" % "2.14.6",
-    dependencyOverrides += "org.apache.avro" % "avro" % "1.8.2")
+    dependencyOverrides += "jline" % "jline" % "2.14.6")
 }
 
 /**