
Commit 294337d

add back HiveContext and createExternalTable
1 parent ba86524 commit 294337d

File tree: 10 files changed, +447 −11 lines changed


docs/sql-migration-guide.md

Lines changed: 0 additions & 4 deletions

@@ -25,10 +25,6 @@ license: |
 ## Upgrading from Spark SQL 2.4 to 3.0
 
   - Since Spark 3.0, when inserting a value into a table column with a different data type, the type coercion is performed as per ANSI SQL standard. Certain unreasonable type conversions such as converting `string` to `int` and `double` to `boolean` are disallowed. A runtime exception will be thrown if the value is out-of-range for the data type of the column. In Spark version 2.4 and earlier, type conversions during table insertion are allowed as long as they are valid `Cast`. When inserting an out-of-range value to a integral field, the low-order bits of the value is inserted(the same as Java/Scala numeric type casting). For example, if 257 is inserted to a field of byte type, the result is 1. The behavior is controlled by the option `spark.sql.storeAssignmentPolicy`, with a default value as "ANSI". Setting the option as "Legacy" restores the previous behavior.
 
-  - In Spark 3.0, the deprecated methods `SQLContext.createExternalTable` and `SparkSession.createExternalTable` have been removed in favor of its replacement, `createTable`.
-
-  - In Spark 3.0, the deprecated `HiveContext` class has been removed. Use `SparkSession.builder.enableHiveSupport()` instead.
-
   - Since Spark 3.0, configuration `spark.sql.crossJoin.enabled` become internal configuration, and is true by default, so by default spark won't raise exception on sql with implicit cross join.
 
   - Since Spark 3.0, we reversed argument order of the trim function from `TRIM(trimStr, str)` to `TRIM(str, trimStr)` to be compatible with other databases.
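
The store-assignment paragraph kept as context above describes the new ANSI insertion behavior. Below is a minimal PySpark sketch of that behavior, not part of this commit; the table name is illustrative and a session with a writable warehouse is assumed.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sql("CREATE TABLE bytes_tbl (b TINYINT) USING parquet")

# Spark 3.0 default: ANSI store assignment rejects the out-of-range value 257
# with an exception instead of silently truncating it.
spark.conf.set("spark.sql.storeAssignmentPolicy", "ANSI")
try:
    spark.sql("INSERT INTO bytes_tbl VALUES (257)")
except Exception as err:
    print("out-of-range insert rejected:", err)

# "Legacy" restores the Spark 2.4 behavior: only the low-order bits are kept,
# so 257 lands in the byte column as 1.
spark.conf.set("spark.sql.storeAssignmentPolicy", "LEGACY")
spark.sql("INSERT INTO bytes_tbl VALUES (257)")
spark.sql("SELECT b FROM bytes_tbl").show()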

project/MimaExcludes.scala

Lines changed: 0 additions & 2 deletions

@@ -52,8 +52,6 @@ object MimaExcludes {
     ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.ExecutorPlugin"),
 
     // [SPARK-28980][SQL][CORE][MLLIB] Remove more old deprecated items in Spark 3
-    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.SQLContext.createExternalTable"),
-    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createExternalTable"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.clustering.KMeans.train"),
     ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.mllib.clustering.KMeans.train"),
     ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.classification.LogisticRegressionWithSGD$"),

python/pyspark/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -113,7 +113,7 @@ def wrapper(self, *args, **kwargs):
 
 
 # for back compatibility
-from pyspark.sql import SQLContext, Row
+from pyspark.sql import SQLContext, HiveContext, Row
 
 __all__ = [
     "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast",

python/pyspark/sql/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -43,7 +43,7 @@
 
 
 from pyspark.sql.types import Row
-from pyspark.sql.context import SQLContext, UDFRegistration
+from pyspark.sql.context import SQLContext, HiveContext, UDFRegistration
 from pyspark.sql.session import SparkSession
 from pyspark.sql.column import Column
 from pyspark.sql.catalog import Catalog
@@ -55,7 +55,7 @@
 
 
 __all__ = [
-    'SparkSession', 'SQLContext', 'UDFRegistration',
+    'SparkSession', 'SQLContext', 'HiveContext', 'UDFRegistration',
     'DataFrame', 'GroupedData', 'Column', 'Catalog', 'Row',
     'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec',
     'DataFrameReader', 'DataFrameWriter', 'PandasCogroupedOps'

python/pyspark/sql/catalog.py

Lines changed: 20 additions & 0 deletions

@@ -138,6 +138,26 @@ def listColumns(self, tableName, dbName=None):
                 isBucket=jcolumn.isBucket()))
         return columns
 
+    @since(2.0)
+    def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):
+        """Creates a table based on the dataset in a data source.
+
+        It returns the DataFrame associated with the external table.
+
+        The data source is specified by the ``source`` and a set of ``options``.
+        If ``source`` is not specified, the default data source configured by
+        ``spark.sql.sources.default`` will be used.
+
+        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
+        created external table.
+
+        :return: :class:`DataFrame`
+        """
+        warnings.warn(
+            "createExternalTable is deprecated since Spark 2.2, please use createTable instead.",
+            DeprecationWarning)
+        return self.createTable(tableName, path, source, schema, **options)
+
     @since(2.2)
     def createTable(self, tableName, path=None, source=None, schema=None, **options):
         """Creates a table based on the dataset in a data source.

python/pyspark/sql/context.py

Lines changed: 66 additions & 1 deletion

@@ -32,7 +32,7 @@
 from pyspark.sql.udf import UDFRegistration
 from pyspark.sql.utils import install_exception_handler
 
-__all__ = ["SQLContext"]
+__all__ = ["SQLContext", "HiveContext"]
 
 
 class SQLContext(object):
@@ -340,6 +340,24 @@ def dropTempTable(self, tableName):
         """
         self.sparkSession.catalog.dropTempView(tableName)
 
+    @since(1.3)
+    def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):
+        """Creates an external table based on the dataset in a data source.
+
+        It returns the DataFrame associated with the external table.
+
+        The data source is specified by the ``source`` and a set of ``options``.
+        If ``source`` is not specified, the default data source configured by
+        ``spark.sql.sources.default`` will be used.
+
+        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
+        created external table.
+
+        :return: :class:`DataFrame`
+        """
+        return self.sparkSession.catalog.createExternalTable(
+            tableName, path, source, schema, **options)
+
     @ignore_unicode_prefix
     @since(1.0)
     def sql(self, sqlQuery):
@@ -463,6 +481,53 @@ def streams(self):
         return StreamingQueryManager(self._ssql_ctx.streams())
 
 
+class HiveContext(SQLContext):
+    """A variant of Spark SQL that integrates with data stored in Hive.
+
+    Configuration for Hive is read from ``hive-site.xml`` on the classpath.
+    It supports running both SQL and HiveQL commands.
+
+    :param sparkContext: The SparkContext to wrap.
+    :param jhiveContext: An optional JVM Scala HiveContext. If set, we do not instantiate a new
+        :class:`HiveContext` in the JVM, instead we make all calls to this object.
+
+    .. note:: Deprecated in 2.0.0. Use SparkSession.builder.enableHiveSupport().getOrCreate().
+    """
+
+    def __init__(self, sparkContext, jhiveContext=None):
+        warnings.warn(
+            "HiveContext is deprecated in Spark 2.0.0. Please use " +
+            "SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
+            DeprecationWarning)
+        if jhiveContext is None:
+            sparkContext._conf.set("spark.sql.catalogImplementation", "hive")
+            sparkSession = SparkSession.builder._sparkContext(sparkContext).getOrCreate()
+        else:
+            sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession())
+        SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
+
+    @classmethod
+    def _createForTesting(cls, sparkContext):
+        """(Internal use only) Create a new HiveContext for testing.
+
+        All test code that touches HiveContext *must* go through this method. Otherwise,
+        you may end up launching multiple derby instances and encounter with incredibly
+        confusing error messages.
+        """
+        jsc = sparkContext._jsc.sc()
+        jtestHive = sparkContext._jvm.org.apache.spark.sql.hive.test.TestHiveContext(jsc, False)
+        return cls(sparkContext, jtestHive)
+
+    def refreshTable(self, tableName):
+        """Invalidate and refresh all the cached the metadata of the given
+        table. For performance reasons, Spark SQL or the external data source
+        library it uses might cache certain metadata about a table, such as the
+        location of blocks. When those change outside of Spark SQL, users should
+        call this function to invalidate the cache.
+        """
+        self._ssql_ctx.refreshTable(tableName)
+
+
 def _test():
     import os
     import doctest
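
`HiveContext` comes back as a deprecated wrapper that simply builds a Hive-enabled `SparkSession`. A sketch of the legacy entry point next to its replacement, not part of this commit; it assumes an existing `SparkContext` and Hive support on the classpath:

from pyspark import SparkContext
from pyspark.sql import HiveContext, SparkSession

sc = SparkContext.getOrCreate()

# Legacy path, kept for backwards compatibility: warns, sets
# spark.sql.catalogImplementation=hive, and reuses or creates a SparkSession.
hive_ctx = HiveContext(sc)
hive_ctx.sql("SHOW TABLES").show()

# Preferred since Spark 2.0:
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.sql("SHOW TABLES").show()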

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 91 additions & 0 deletions

@@ -479,6 +479,97 @@ class SQLContext private[sql](val sparkSession: SparkSession)
   def readStream: DataStreamReader = sparkSession.readStream
 
 
+  /**
+   * Creates an external table from the given path and returns the corresponding DataFrame.
+   * It will use the default data source configured by spark.sql.sources.default.
+   *
+   * @group ddl_ops
+   * @since 1.3.0
+   */
+  @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0")
+  def createExternalTable(tableName: String, path: String): DataFrame = {
+    sparkSession.catalog.createTable(tableName, path)
+  }
+
+  /**
+   * Creates an external table from the given path based on a data source
+   * and returns the corresponding DataFrame.
+   *
+   * @group ddl_ops
+   * @since 1.3.0
+   */
+  @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0")
+  def createExternalTable(
+      tableName: String,
+      path: String,
+      source: String): DataFrame = {
+    sparkSession.catalog.createTable(tableName, path, source)
+  }
+
+  /**
+   * Creates an external table from the given path based on a data source and a set of options.
+   * Then, returns the corresponding DataFrame.
+   *
+   * @group ddl_ops
+   * @since 1.3.0
+   */
+  @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0")
+  def createExternalTable(
+      tableName: String,
+      source: String,
+      options: java.util.Map[String, String]): DataFrame = {
+    sparkSession.catalog.createTable(tableName, source, options)
+  }
+
+  /**
+   * (Scala-specific)
+   * Creates an external table from the given path based on a data source and a set of options.
+   * Then, returns the corresponding DataFrame.
+   *
+   * @group ddl_ops
+   * @since 1.3.0
+   */
+  @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0")
+  def createExternalTable(
+      tableName: String,
+      source: String,
+      options: Map[String, String]): DataFrame = {
+    sparkSession.catalog.createTable(tableName, source, options)
+  }
+
+  /**
+   * Create an external table from the given path based on a data source, a schema and
+   * a set of options. Then, returns the corresponding DataFrame.
+   *
+   * @group ddl_ops
+   * @since 1.3.0
+   */
+  @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0")
+  def createExternalTable(
+      tableName: String,
+      source: String,
+      schema: StructType,
+      options: java.util.Map[String, String]): DataFrame = {
+    sparkSession.catalog.createTable(tableName, source, schema, options)
+  }
+
+  /**
+   * (Scala-specific)
+   * Create an external table from the given path based on a data source, a schema and
+   * a set of options. Then, returns the corresponding DataFrame.
+   *
+   * @group ddl_ops
+   * @since 1.3.0
+   */
+  @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0")
+  def createExternalTable(
+      tableName: String,
+      source: String,
+      schema: StructType,
+      options: Map[String, String]): DataFrame = {
+    sparkSession.catalog.createTable(tableName, source, schema, options)
+  }
+
   /**
    * Registers the given `DataFrame` as a temporary table in the catalog. Temporary tables exist
    * only during the lifetime of this instance of SQLContext.
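
All of the Scala overloads above delegate to `sparkSession.catalog.createTable`. For reference, a PySpark sketch of the schema-and-options variant, not part of this commit; the table name, schema, and path are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType

spark = SparkSession.builder.getOrCreate()

schema = StructType([
    StructField("id", LongType()),
    StructField("name", StringType()),
])

# Unmanaged ("external") table: the data stays at the given path, only the
# metadata is registered in the catalog.
spark.catalog.createTable(
    "people_ext", source="csv", schema=schema,
    path="/tmp/people_csv", header="true")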
