diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index 13c6f8f3a28e..601b45d00a7c 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -154,11 +154,11 @@ Dependencies
 ============= ========================= ======================================
 Package       Minimum supported version Note
 ============= ========================= ======================================
-`pandas`      0.23.2                    Optional for Spark SQL
+`pandas`      1.0.5                     Optional for Spark SQL
 `NumPy`       1.7                       Required for MLlib DataFrame-based API
 `pyarrow`     1.0.0                     Optional for Spark SQL
 `Py4J`        0.10.9.2                  Required
-`pandas`      0.23.2                    Required for pandas API on Spark
+`pandas`      1.0.5                     Required for pandas API on Spark
 `pyarrow`     1.0.0                     Required for pandas API on Spark
 `Numpy`       1.14                      Required for pandas API on Spark
 ============= ========================= ======================================
diff --git a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
index 060f24c8f41f..932fc739bb80 100644
--- a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
+++ b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
@@ -21,3 +21,4 @@ Upgrading from PySpark 3.2 to 3.3
 =================================
 
 * In Spark 3.3, the ``drop`` method of pandas API on Spark DataFrame supports dropping rows by ``index``, and sets dropping by index instead of column by default.
+* In Spark 3.3, PySpark raises the minimum supported version of Pandas from 0.23.2 to 1.0.5.
diff --git a/python/docs/source/user_guide/sql/arrow_pandas.rst b/python/docs/source/user_guide/sql/arrow_pandas.rst
index 78d3e7ad84e3..20a9f935d586 100644
--- a/python/docs/source/user_guide/sql/arrow_pandas.rst
+++ b/python/docs/source/user_guide/sql/arrow_pandas.rst
@@ -387,7 +387,7 @@ working with timestamps in ``pandas_udf``\s to get the best performance, see
 Recommended Pandas and PyArrow Versions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For usage with pyspark.sql, the minimum supported versions of Pandas is 0.23.2 and PyArrow is 1.0.0.
+For usage with pyspark.sql, the minimum supported versions of Pandas and PyArrow are 1.0.5 and 1.0.0, respectively.
 Higher versions may be used, however, compatibility and data correctness can not be guaranteed and should
 be verified by the user.
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 72677d18e4b8..2a861fa69702 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -2232,7 +2232,9 @@ def test_mad(self):
         pser.index = pmidx
         psser = ps.from_pandas(pser)
 
-        self.assert_eq(pser.mad(), psser.mad())
+        # Mark almost as True to avoid a precision issue like:
+        # "21.555555555555554 != 21.555555555555557"
+        self.assert_eq(pser.mad(), psser.mad(), almost=True)
 
     def test_to_frame(self):
         pser = pd.Series(["a", "b", "c"])
diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py
index cc0db017c301..bc6202f85463 100644
--- a/python/pyspark/sql/pandas/utils.py
+++ b/python/pyspark/sql/pandas/utils.py
@@ -19,7 +19,7 @@
 def require_minimum_pandas_version() -> None:
     """Raise ImportError if minimum version of Pandas is not installed"""
     # TODO(HyukjinKwon): Relocate and deduplicate the version specification.
-    minimum_pandas_version = "0.23.2"
+    minimum_pandas_version = "1.0.5"
 
     from distutils.version import LooseVersion
diff --git a/python/setup.py b/python/setup.py
index 4507a2686e2c..174995d4aec4 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -111,7 +111,7 @@ def _supports_symlinks():
 # For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
 # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
 # Also don't forget to update python/docs/source/getting_started/install.rst.
-_minimum_pandas_version = "0.23.2"
+_minimum_pandas_version = "1.0.5"
 _minimum_pyarrow_version = "1.0.0"
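
Note on the version gate in python/pyspark/sql/pandas/utils.py: the hunk above only changes the minimum_pandas_version constant. Per its docstring, the rest of require_minimum_pandas_version (not shown in this hunk) compares the installed pandas against that constant via LooseVersion and raises ImportError when the requirement is not met. A minimal sketch of that kind of check, with illustrative names and messages rather than the exact PySpark source:

    from distutils.version import LooseVersion

    def check_minimum_pandas(minimum_pandas_version: str = "1.0.5") -> None:
        # Illustrative minimum-version gate: raise ImportError if pandas is
        # missing or older than the required version. Not the exact PySpark code.
        try:
            import pandas
        except ImportError as error:
            raise ImportError(
                "Pandas >= %s must be installed; however, it was not found."
                % minimum_pandas_version
            ) from error
        if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
            raise ImportError(
                "Pandas >= %s must be installed; however, your version was %s."
                % (minimum_pandas_version, pandas.__version__)
            )

The same constant is duplicated in python/setup.py and documented in python/docs/source/getting_started/install.rst, which is why all of these files change together in this diff.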
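
Note on the test_series.py change: passing almost=True works around a property of floating-point arithmetic rather than a correctness bug. pandas and pandas API on Spark may accumulate the same values in a different order, so a result such as the mean absolute deviation can differ in the trailing digits, as in the "21.555555555555554 != 21.555555555555557" example quoted in the comment. A plain-Python illustration (not Spark code) of why an exact equality check is too strict:

    import math

    # Floating-point addition is not associative, so a different accumulation
    # order can change the trailing digits of an otherwise identical result.
    a = (0.1 + 0.2) + 0.3   # 0.6000000000000001
    b = 0.1 + (0.2 + 0.3)   # 0.6

    print(a == b)              # False -- an exact assertion would fail
    print(math.isclose(a, b))  # True  -- a tolerance-based comparison passes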