@@ -39,10 +39,10 @@ class CatalogSuite extends RemoteSparkSession with SQLHelper {
assert(dbs.length == 2)
assert(dbs.map(_.name) sameElements Array(db, currentDb))
assert(dbs.map(_.catalog).distinct sameElements Array("spark_catalog"))
var databasesWithPattern = spark.catalog.listDatabases("def*").collect().sortBy(_.name)
var databasesWithPattern = spark.catalog.listDatabases("def%").collect().sortBy(_.name)
assert(databasesWithPattern.length == 1)
assert(databasesWithPattern.map(_.name) sameElements Array(currentDb))
databasesWithPattern = spark.catalog.listDatabases("def2*").collect().sortBy(_.name)
databasesWithPattern = spark.catalog.listDatabases("def2%").collect().sortBy(_.name)
assert(databasesWithPattern.length == 0)
val database = spark.catalog.getDatabase(db)
assert(database.name == db)
@@ -75,10 +75,10 @@ class CatalogSuite extends RemoteSparkSession with SQLHelper {
val catalogsAfterChange = spark.catalog.listCatalogs().collect()
assert(catalogsAfterChange.length == 2)
assert(catalogsAfterChange.map(_.name).toSet == Set("testcat", "spark_catalog"))
var catalogsWithPattern = spark.catalog.listCatalogs("spark*").collect()
var catalogsWithPattern = spark.catalog.listCatalogs("spark%").collect()
assert(catalogsWithPattern.length == 1)
assert(catalogsWithPattern.map(_.name) sameElements Array("spark_catalog"))
catalogsWithPattern = spark.catalog.listCatalogs("hive*").collect()
catalogsWithPattern = spark.catalog.listCatalogs("hive%").collect()
assert(catalogsWithPattern.length == 0)
} finally {
spark.catalog.setCurrentCatalog(currentCatalog)
@@ -128,12 +128,12 @@ class CatalogSuite extends RemoteSparkSession with SQLHelper {
jsonTableName))
assert(
spark.catalog
.listTables(spark.catalog.currentDatabase, "par*")
.listTables(spark.catalog.currentDatabase, "par%")
.collect()
.map(_.name)
.toSet == Set(parquetTableName))
assert(
spark.catalog.listTables(spark.catalog.currentDatabase, "txt*").collect().isEmpty)
spark.catalog.listTables(spark.catalog.currentDatabase, "txt%").collect().isEmpty)
}
assert(spark.catalog.tableExists(parquetTableName))
assert(!spark.catalog.tableExists(orcTableName))
@@ -212,11 +212,11 @@ class CatalogSuite extends RemoteSparkSession with SQLHelper {
}.getMessage
assert(message.contains("UNRESOLVED_ROUTINE"))

val functionsWithPattern1 = spark.catalog.listFunctions(dbName, "to*").collect()
val functionsWithPattern1 = spark.catalog.listFunctions(dbName, "to%").collect()
assert(functionsWithPattern1.nonEmpty)
assert(functionsWithPattern1.exists(f => f.name == "to_date"))
val functionsWithPattern2 =
spark.catalog.listFunctions(dbName, "*not_existing_func*").collect()
spark.catalog.listFunctions(dbName, "%not_existing_func%").collect()
assert(functionsWithPattern2.isEmpty)
}

14 changes: 12 additions & 2 deletions docs/sql-ref-syntax-aux-show-tables.md
@@ -40,12 +40,21 @@ SHOW TABLES [ { FROM | IN } database_name ] [ LIKE regex_pattern ]

* **regex_pattern**

Specifies the regular expression pattern that is used to filter out unwanted tables.
> panbingkun (Contributor, Feb 28, 2024): [before/after screenshots of the rendered docs page]

1. After Version 4.0
   * Same as SQL `LIKE` expressions: `%` matches any character(s), and `_` matches a single character.
   * Examples are `'employees'`, `'emp%'`, and `'emplo_ees'`, all of which match the table named `'employees'`.
   * **Note**
     * The `OR` syntax represented by `|` is no longer supported by default.
     * You can restore the pre-4.0 semantics by setting `spark.sql.legacy.useVerticalBarAndStarAsWildcardsInLikePattern` to `true`, as sketched below.

1. Before Version 4.0
   * Except for the `*` and `|` characters, the pattern works like a regular expression.
   * `*` alone matches zero or more characters, and `|` separates multiple alternative regular expressions, any of which can match.
   * Leading and trailing blanks are trimmed from the input pattern before processing. The pattern match is case-insensitive.
   * Examples are `'employees'`, `'emp*'`, and `'emp*|*ees'`, all of which match the table named `'employees'`.
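
A minimal sketch of the two behaviors, assuming an active `SparkSession` named `spark` and an illustrative table `employees`:

```scala
spark.sql("CREATE TABLE employees(id INT) USING parquet")

// After 4.0 (default): SQL LIKE semantics.
spark.sql("SHOW TABLES LIKE 'emp%'").show()   // matches `employees`
spark.sql("SHOW TABLES LIKE 'emp*'").show()   // no match: '*' is now a literal character

// Restore the pre-4.0 wildcard semantics.
spark.sql("SET spark.sql.legacy.useVerticalBarAndStarAsWildcardsInLikePattern=true")
spark.sql("SHOW TABLES LIKE 'emp*'").show()   // legacy semantics: matches `employees` again
```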

### Examples

@@ -79,7 +88,7 @@ SHOW TABLES IN userdb;
+--------+---------+-----------+

-- List all tables from default database matching the pattern `sam*`
SHOW TABLES FROM default LIKE 'sam*';
SHOW TABLES FROM default LIKE 'sam%';
+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
@@ -88,6 +97,7 @@ SHOW TABLES FROM default LIKE 'sam*';
+--------+---------+-----------+

-- List all tables matching the pattern `sam*|suj`
-- Note: After Version 4.0, the `OR` syntax represented by `|` is no longer supported by default.
SHOW TABLES LIKE 'sam*|suj';
+--------+---------+-----------+
|database|tableName|isTemporary|
16 changes: 8 additions & 8 deletions python/pyspark/sql/catalog.py
@@ -142,10 +142,10 @@ def listCatalogs(self, pattern: Optional[str] = None) -> List[CatalogMetadata]:
>>> spark.catalog.listCatalogs()
[CatalogMetadata(name='spark_catalog', description=None)]

>>> spark.catalog.listCatalogs("spark*")
>>> spark.catalog.listCatalogs("spark%")
[CatalogMetadata(name='spark_catalog', description=None)]

>>> spark.catalog.listCatalogs("hive*")
>>> spark.catalog.listCatalogs("hive%")
[]
"""
if pattern is None:
@@ -213,10 +213,10 @@ def listDatabases(self, pattern: Optional[str] = None) -> List[Database]:
>>> spark.catalog.listDatabases()
[Database(name='default', catalog='spark_catalog', description='default database', ...

>>> spark.catalog.listDatabases("def*")
>>> spark.catalog.listDatabases("def%")
[Database(name='default', catalog='spark_catalog', description='default database', ...

>>> spark.catalog.listDatabases("def2*")
>>> spark.catalog.listDatabases("def2%")
[]
"""
if pattern is None:
@@ -342,10 +342,10 @@ def listTables(
>>> spark.catalog.listTables()
[Table(name='test_view', catalog=None, namespace=[], description=None, ...

>>> spark.catalog.listTables(pattern="test*")
>>> spark.catalog.listTables(pattern="test%")
[Table(name='test_view', catalog=None, namespace=[], description=None, ...

>>> spark.catalog.listTables(pattern="table*")
>>> spark.catalog.listTables(pattern="table%")
[]

>>> _ = spark.catalog.dropTempView("test_view")
@@ -470,10 +470,10 @@ def listFunctions(
>>> spark.catalog.listFunctions()
[Function(name=...

>>> spark.catalog.listFunctions(pattern="to_*")
>>> spark.catalog.listFunctions(pattern="to_%")
[Function(name=...

>>> spark.catalog.listFunctions(pattern="*not_existing_func*")
>>> spark.catalog.listFunctions(pattern="%not_existing_func%")
[]
"""
if dbName is None:
16 changes: 8 additions & 8 deletions python/pyspark/sql/tests/test_catalog.py
@@ -42,9 +42,9 @@ def test_list_databases(self):
spark.sql("CREATE DATABASE some_db")
databases = [db.name for db in spark.catalog.listDatabases()]
self.assertEqual(sorted(databases), ["default", "some_db"])
databases = [db.name for db in spark.catalog.listDatabases("def*")]
databases = [db.name for db in spark.catalog.listDatabases("def%")]
self.assertEqual(sorted(databases), ["default"])
databases = [db.name for db in spark.catalog.listDatabases("def2*")]
databases = [db.name for db in spark.catalog.listDatabases("def2%")]
self.assertEqual(sorted(databases), [])

def test_database_exists(self):
@@ -94,17 +94,17 @@ def test_list_tables(self):

tables = sorted(spark.catalog.listTables(), key=lambda t: t.name)
tablesWithPattern = sorted(
spark.catalog.listTables(pattern="tab*"), key=lambda t: t.name
spark.catalog.listTables(pattern="tab%"), key=lambda t: t.name
)
tablesDefault = sorted(
spark.catalog.listTables("default"), key=lambda t: t.name
)
tablesDefaultWithPattern = sorted(
spark.catalog.listTables("default", "tab*"), key=lambda t: t.name
spark.catalog.listTables("default", "tab%"), key=lambda t: t.name
)
tablesSomeDb = sorted(spark.catalog.listTables("some_db"), key=lambda t: t.name)
tablesSomeDbWithPattern = sorted(
spark.catalog.listTables("some_db", "tab*"), key=lambda t: t.name
spark.catalog.listTables("some_db", "tab%"), key=lambda t: t.name
)
self.assertEqual(tables, tablesDefault)
self.assertEqual(tablesWithPattern, tablesDefaultWithPattern)
@@ -265,10 +265,10 @@ def test_list_functions(self):
self.assertEqual(functions, functionsDefault)

functionsWithPattern = dict(
(f.name, f) for f in spark.catalog.listFunctions(pattern="to*")
(f.name, f) for f in spark.catalog.listFunctions(pattern="to%")
)
functionsDefaultWithPattern = dict(
(f.name, f) for f in spark.catalog.listFunctions("default", "to*")
(f.name, f) for f in spark.catalog.listFunctions("default", "to%")
)
self.assertTrue(len(functionsWithPattern) > 10)
self.assertFalse("+" in functionsWithPattern)
@@ -279,7 +279,7 @@ def test_list_functions(self):
self.assertTrue("to_unix_timestamp" in functionsWithPattern)
self.assertEqual(functionsWithPattern, functionsDefaultWithPattern)
functionsWithPattern = dict(
(f.name, f) for f in spark.catalog.listFunctions(pattern="*not_existing_func*")
(f.name, f) for f in spark.catalog.listFunctions(pattern="%not_existing_func%")
)
self.assertTrue(len(functionsWithPattern) == 0)

@@ -17,7 +17,16 @@

package org.apache.spark.sql.connector.catalog;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

import scala.jdk.javaapi.CollectionConverters;

import org.apache.spark.annotation.Evolving;
import org.apache.spark.sql.catalyst.util.StringUtils;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
@@ -26,10 +35,6 @@
import org.apache.spark.sql.errors.QueryExecutionErrors;
import org.apache.spark.sql.types.StructType;

import java.util.Collections;
import java.util.Map;
import java.util.Set;

/**
* Catalog methods for working with Tables.
* <p>
@@ -97,6 +102,30 @@ public interface TableCatalog extends CatalogPlugin {
*/
Identifier[] listTables(String[] namespace) throws NoSuchNamespaceException;

/**
* List the tables in a namespace from the catalog by pattern string.
* <p>
* If the catalog supports views, this must return identifiers for only tables and not views.
*
* @param namespace a multi-part namespace
* @param pattern the filter pattern. When
*                'spark.sql.legacy.useVerticalBarAndStarAsWildcardsInLikePattern' is true,
*                '*' matches any character(s) and '|' separates alternative patterns; when it
*                is false, '%' matches any character(s) and '_' matches a single character.
*                Please refer to 'regex_pattern' in
*                https://spark.apache.org/docs/latest/sql-ref-syntax-aux-show-tables.html#parameters
*                for more details.
* @return an array of Identifiers for tables
* @throws NoSuchNamespaceException If the namespace does not exist (optional).
*/
default Identifier[] listTables(String[] namespace, String pattern)
throws NoSuchNamespaceException {
List<String> tableNames = Arrays.stream(listTables(namespace)).map(Identifier::name).toList();
return CollectionConverters.asJava(StringUtils.filterPattern(
CollectionConverters.asScala(tableNames).toSeq(), pattern)).stream().map(
name -> Identifier.of(namespace, name)).toArray(Identifier[]::new);
}
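
For implementations that do not override this method, a hedged sketch of what the default body above amounts to in plain Scala (the namespace and table names are illustrative stand-ins for a real `listTables(namespace)` result):

```scala
import org.apache.spark.sql.catalyst.util.StringUtils
import org.apache.spark.sql.connector.catalog.Identifier

val namespace = Array("default")
// Stand-in for what listTables(namespace) would return.
val unfiltered = Array("parquet_tab", "orc_tab", "json_tab").map(Identifier.of(namespace, _))
// Filter by table name, then rebuild identifiers in the same namespace.
val filtered = StringUtils
  .filterPattern(unfiltered.map(_.name).toSeq, "par%")
  .map(Identifier.of(namespace, _))
// filtered holds only the identifier for default.parquet_tab
```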

/**
* Load table metadata by {@link Identifier identifier} from the catalog.
* <p>
@@ -1033,7 +1033,8 @@ class SessionCatalog(
* Note that, if the specified database is global temporary view database, we will list global
* temporary views.
*/
def listTables(db: String): Seq[TableIdentifier] = listTables(db, "*")
def listTables(db: String): Seq[TableIdentifier] =
listTables(db, StringUtils.getMatchAllWildcard)

/**
* List all matching tables in the specified database, including local temporary views.
@@ -1830,7 +1831,8 @@ class SessionCatalog(
* returns the function identifier and the scope in which it was defined (system or user
* defined).
*/
def listFunctions(db: String): Seq[(FunctionIdentifier, String)] = listFunctions(db, "*")
def listFunctions(db: String): Seq[(FunctionIdentifier, String)] =
listFunctions(db, StringUtils.getMatchAllWildcard)

/**
* List all matching functions in the specified database, including temporary functions. This
@@ -1882,7 +1884,7 @@ class SessionCatalog(
dropTable(table, ignoreIfNotExists = false, purge = false)
}
// Temp functions are dropped below, we only need to drop permanent functions here.
externalCatalog.listFunctions(DEFAULT_DATABASE, "*").map { f =>
externalCatalog.listFunctions(DEFAULT_DATABASE, StringUtils.getMatchAllWildcard).map { f =>
FunctionIdentifier(f, Some(DEFAULT_DATABASE))
}.foreach(dropFunction(_, ignoreIfNotExists = false))
clearTempTables()
@@ -108,26 +108,84 @@ object StringUtils extends Logging {
// scalastyle:on caselocale

/**
* This utility can be used for filtering pattern in the "Like" of "Show Tables / Functions" DDL
* Get the wildcard that matches "all".
*/
def getMatchAllWildcard: String = {
if (SQLConf.get.legacyUseStarAndVerticalBarAsWildcardsInLikePattern) {
"*"
} else {
"%"
}
}

def filterPattern(names: Seq[String], pattern: String): Seq[String] = {
if (SQLConf.get.legacyUseStarAndVerticalBarAsWildcardsInLikePattern) {
legacyFilterPattern(names, pattern)
} else {
filterBySQLLikePattern(names, pattern)
}
}
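
A hedged sketch of the dispatch above; the legacy flag defaults to false, so the SQL `LIKE` path is taken unless it is explicitly enabled (names are illustrative):

```scala
import org.apache.spark.sql.catalyst.util.StringUtils

val names = Seq("to_date", "to_unix_timestamp", "trim")
// Default semantics: '%' and '_' are the wildcards.
StringUtils.filterPattern(names, "to_%")  // Seq("to_date", "to_unix_timestamp")
// With spark.sql.legacy.useVerticalBarAndStarAsWildcardsInLikePattern=true the
// legacy path runs instead, where "to*" would match the same two names.
```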

/**
* This legacy utility can be used for filtering pattern in the "Like" of
* "Show Tables / Functions" DDL.
* @param names the names list to be filtered
* @param pattern the filter pattern; only '*' and '|' are allowed as wildcards, anything else
*                follows regular expression convention; the match is case-insensitive and
*                white spaces on both ends of the pattern are ignored
* @return the filtered names list, in order
*/
def filterPattern(names: Seq[String], pattern: String): Seq[String] = {
def legacyFilterPattern(names: Seq[String], pattern: String): Seq[String] = {
val funcNames = scala.collection.mutable.SortedSet.empty[String]
pattern.trim().split("\\|").foreach { subPattern =>
try {
val regex = ("(?i)" + subPattern.replaceAll("\\*", ".*")).r
funcNames ++= names.filter{ name => regex.pattern.matcher(name).matches() }
funcNames ++= names.filter { name => regex.pattern.matcher(name).matches() }
} catch {
case _: PatternSyntaxException =>
}
}
funcNames.toSeq
}

/**
* This utility can be used for filtering pattern in the "Like" of "Show Tables / Functions" DDL.
* @param names the names list to be filtered
* @param pattern the filter pattern, same as SQL type `like` expressions:
*                '%' for any character(s), and '_' for a single character
* @return the filtered names list
*/
def filterBySQLLikePattern(names: Seq[String], pattern: String): Seq[String] = {
try {
val p = Pattern.compile(likePatternToRegExp(pattern), Pattern.CASE_INSENSITIVE)
names.filter { name => p.matcher(name).matches() }
} catch {
case _: PatternSyntaxException => Seq.empty[String]
}
}

private[util] def likePatternToRegExp(pattern: String): String = {
val regExp = new StringBuilder()

var index = 0
while (index < pattern.length) {
val cur = pattern.charAt(index)
cur match {
// Make a special case for "\\_" and "\\%"
case '\\' if (index + 1 < pattern.length()
&& (pattern.charAt(index + 1) == '_' || pattern.charAt(index + 1) == '%')) =>
regExp += pattern.charAt(index + 1)
index = index + 1
case '_' => regExp ++= "."
case '%' => regExp ++= ".*?"
case _ => regExp ++= Pattern.quote(Character.toString(cur))
}
index = index + 1
}

regExp.result()
}
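
A hedged sketch of the resulting match semantics; `likePatternToRegExp` is `private[util]`, so the behavior is shown through `filterBySQLLikePattern` (names are illustrative):

```scala
import org.apache.spark.sql.catalyst.util.StringUtils

val names = Seq("employees", "employers", "100%")
StringUtils.filterBySQLLikePattern(names, "emp%")       // Seq("employees", "employers")
StringUtils.filterBySQLLikePattern(names, "emplo_ees")  // Seq("employees")
StringUtils.filterBySQLLikePattern(names, "100\\%")     // Seq("100%"): escaped '%' stays literal
```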

/**
* A string concatenator for plan strings. Uses length from a configured value, and
* prints a warning the first time a plan is truncated.