Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public class GetTablesOperation extends MetadataOperation {
private final String schemaName;
private final String tableName;
private final List<String> tableTypes = new ArrayList<String>();
private final RowSet rowSet;
protected final RowSet rowSet;
private final TableTypeMapping tableTypeMapping;


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.hive.thriftserver

import java.util.{List => JList}

import scala.collection.JavaConverters.seqAsJavaListConverter

import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType
import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObjectUtils
import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.GetTablesOperation
import org.apache.hive.service.cli.session.HiveSession

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.catalog.CatalogTableType._

/**
* Spark's own GetTablesOperation
*
* @param sqlContext SQLContext to use
* @param parentSession a HiveSession from SessionManager
* @param catalogName catalog name. null if not applicable
* @param schemaName database name, null or a concrete database name
* @param tableName table name pattern
* @param tableTypes list of allowed table types, e.g. "TABLE", "VIEW"
*/
private[hive] class SparkGetTablesOperation(
Copy link
Member

@gatorsmile gatorsmile Oct 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/**
 * @param schemaName database name, null or a concrete database name
 * @param tableName table name pattern
 * @param tableTypes list of allowed table types
 * ................
 */

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

sqlContext: SQLContext,
parentSession: HiveSession,
catalogName: String,
schemaName: String,
tableName: String,
tableTypes: JList[String])
extends GetTablesOperation(parentSession, catalogName, schemaName, tableName, tableTypes) {

// NOTE: no `this.tableTypes.addAll(tableTypes)` here on purpose. The parent's `tableTypes`
// field is private, so inside this class `this.tableTypes` is this class's own constructor
// parameter; adding the list to itself would duplicate its entries, or throw
// UnsupportedOperationException for fixed-size/unmodifiable lists. `runInternal` filters
// against the constructor parameter directly, so no copy is needed.

/**
 * Runs the GetTables metadata request against Spark's session catalog.
 *
 * Steps, in order:
 *  1. mark the operation RUNNING and install `sqlContext.sharedState.jarClassLoader` as the
 *     current thread's context class loader;
 *  2. list the databases matching the converted schema pattern;
 *  3. when `isAuthV2Enabled`, authorize the request for those databases;
 *  4. for each matching database, list the tables matching the converted table-name pattern,
 *     load each table's metadata and append one row per table whose type passes the
 *     `tableTypes` filter (a null or empty filter accepts every type);
 *  5. mark the operation FINISHED.
 *
 * The state is only set to FINISHED on the normal path; nothing in this method handles an
 * exception thrown by the steps above.
 */
override def runInternal(): Unit = {
setState(OperationState.RUNNING)
// Always use the latest class loader provided by executionHive's state.
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader
Thread.currentThread().setContextClassLoader(executionHiveClassLoader)

val catalog = sqlContext.sessionState.catalog
// Translate the client-supplied schema pattern with the inherited helper before asking the
// catalog for matching databases.
val schemaPattern = convertSchemaPattern(schemaName)
val matchingDbs = catalog.listDatabases(schemaPattern)

// Authorization covers the matched databases; the command string passed along carries the
// original (unconverted) catalog and schema arguments.
if (isAuthV2Enabled) {
val privObjs =
HivePrivilegeObjectUtils.getHivePrivDbObjects(seqAsJavaListConverter(matchingDbs).asJava)
val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName"
authorizeMetaGets(HiveOperationType.GET_TABLES, privObjs, cmdStr)
}

val tablePattern = convertIdentifierPattern(tableName, true)
matchingDbs.foreach { dbName =>
catalog.listTables(dbName, tablePattern).foreach { tableIdentifier =>
// One metadata lookup per listed table. NOTE(review): flagged in review as potentially slow
// for schemas with many tables.
val catalogTable = catalog.getTableMetadata(tableIdentifier)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be very slow for big schemas. Calling getTableMetadata on every table will trigger 3 separate database calls to the metastore (requireDbExists, requireTableExists, and getTable) taking ~tens of ms for every table. So it can be tens of seconds for schemas with hundreds of tables.

The underlying Hive Thriftserver GetTables uses MetastoreClient.getTableObjectsByName (https://hive.apache.org/javadocs/r2.1.1/api/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.html#getTableObjectsByName-java.lang.String-java.util.List-) call to bulk-list the tables, but we don't expose that through our SessionCatalog / ExternalCatalog / HiveClientImpl

Would it be possible to thread that bulk getTableObjectsByName operation through our catalog APIs, to be able to retrieve the tables efficiently here? @wangyum @gatorsmile - what do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LantaoJin @wangyum Could either of you submit a PR to resolve the issue raised by @juliuszsompolski ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will take this issue.

val tableType = tableTypeString(catalogTable.tableType)
// A null or empty filter means "all table types"; otherwise the mapped type must be listed.
if (tableTypes == null || tableTypes.isEmpty || tableTypes.contains(tableType)) {
// Row layout: empty string, database, table name, table type, comment (empty when absent).
// Presumably these line up with the parent operation's result-set columns -- TODO confirm.
val rowData = Array[AnyRef](
"",
catalogTable.database,
catalogTable.identifier.table,
tableType,
catalogTable.comment.getOrElse(""))
rowSet.addRow(rowData)
}
}
}
setState(OperationState.FINISHED)
}

/**
 * Maps a Spark catalog table type to the table-type string emitted in the result rows.
 *
 * @param tableType the catalog table type of a listed table
 * @return "TABLE" for EXTERNAL and MANAGED tables, "VIEW" for views
 * @throws IllegalArgumentException for any other catalog table type
 */
private def tableTypeString(tableType: CatalogTableType): String = tableType match {
  case EXTERNAL | MANAGED => "TABLE"
  case VIEW => "VIEW"
  case t =>
    // The previous message named an unrelated routine ("showCreateHiveTable"); report the
    // offending type without pointing at the wrong place.
    throw new IllegalArgumentException(s"Unknown table type is found: $t")
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@

package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}
import java.util.{List => JList, Map => JMap}
import java.util.concurrent.ConcurrentHashMap

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, GetSchemasOperation, Operation, OperationManager}
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, GetSchemasOperation, MetadataOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveUtils
import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation, SparkGetSchemasOperation}
import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation, SparkGetSchemasOperation, SparkGetTablesOperation}
import org.apache.spark.sql.internal.SQLConf

/**
Expand Down Expand Up @@ -76,6 +76,22 @@ private[thriftserver] class SparkSQLOperationManager()
operation
}

/**
 * Builds a [[SparkGetTablesOperation]] for the given session and registers it with this
 * manager.
 *
 * The SQLContext is looked up in `sessionToContexts` by the session's handle; a missing entry
 * fails the `require` check. The new operation is stored in `handleToOperation` under its own
 * handle before being returned. The whole body runs inside `synchronized`.
 *
 * @param parentSession session issuing the request
 * @param catalogName catalog name passed through to the operation
 * @param schemaName schema name or pattern passed through to the operation
 * @param tableName table name pattern passed through to the operation
 * @param tableTypes table types to include, passed through to the operation
 * @return the newly created and registered operation
 */
override def newGetTablesOperation(
    parentSession: HiveSession,
    catalogName: String,
    schemaName: String,
    tableName: String,
    tableTypes: JList[String]): MetadataOperation = synchronized {
  val sessionContext = sessionToContexts.get(parentSession.getSessionHandle)
  require(
    sessionContext != null,
    s"Session handle: ${parentSession.getSessionHandle} has not been" +
      " initialized or had already closed.")

  val getTablesOp = new SparkGetTablesOperation(
    sessionContext,
    parentSession,
    catalogName,
    schemaName,
    tableName,
    tableTypes)
  handleToOperation.put(getTablesOp.getHandle, getTablesOp)
  logDebug(s"Created GetTablesOperation with session=$parentSession.")
  getTablesOp
}

def setConfMap(conf: SQLConf, confMap: java.util.Map[String, String]): Unit = {
val iterator = confMap.entrySet().iterator()
while (iterator.hasNext) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest {
var defaultV2: String = null
var data: ArrayBuffer[Int] = null

withMultipleConnectionJdbcStatement("test_map")(
withMultipleConnectionJdbcStatement("test_map", "db1.test_map2")(
// create table
{ statement =>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

package org.apache.spark.sql.hive.thriftserver

import java.util.Properties
import java.util.{Arrays => JArrays, List => JList, Properties}

import org.apache.hive.jdbc.{HiveConnection, HiveQueryResultSet, Utils => JdbcUtils}
import org.apache.hive.service.auth.PlainSaslHelper
Expand Down Expand Up @@ -100,4 +100,89 @@ class SparkMetadataOperationSuite extends HiveThriftJdbcTest {
}
}
}

test("Spark's own GetTablesOperation(SparkGetTablesOperation)") {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/**
 * Issues a GetTables request over a raw Thrift CLI client and hands the resulting
 * [[HiveQueryResultSet]] to `f`.
 *
 * A session is opened on `localhost:serverPort`, the request is populated from the arguments,
 * the response status is verified, and a result set is built on the returned operation handle.
 * The result set (when one was created), the JDBC connection and both transports are closed
 * afterwards, whether or not `f` succeeds.
 *
 * @param schema schema name or pattern placed in the request
 * @param tableNamePattern table name pattern placed in the request
 * @param tableTypes table types placed in the request; the tests also pass null
 * @param f assertions to run against the result set
 */
def testGetTablesOperation(
    schema: String,
    tableNamePattern: String,
    tableTypes: JList[String])(f: HiveQueryResultSet => Unit): Unit = {
  val rawTransport = new TSocket("localhost", serverPort)
  val connection = new HiveConnection(s"jdbc:hive2://localhost:$serverPort", new Properties)
  val user = System.getProperty("user.name")
  val transport = PlainSaslHelper.getPlainTransport(user, "anonymous", rawTransport)
  val client = new TCLIService.Client(new TBinaryProtocol(transport))
  transport.open()

  var rs: HiveQueryResultSet = null

  try {
    val openResp = client.OpenSession(new TOpenSessionReq)
    val sessHandle = openResp.getSessionHandle

    val getTableReq = new TGetTablesReq(sessHandle)
    getTableReq.setSchemaName(schema)
    getTableReq.setTableName(tableNamePattern)
    getTableReq.setTableTypes(tableTypes)

    val getTableResp = client.GetTables(getTableReq)

    JdbcUtils.verifySuccess(getTableResp.getStatus)

    rs = new HiveQueryResultSet.Builder(connection)
      .setClient(client)
      .setSessionHandle(sessHandle)
      .setStmtHandle(getTableResp.getOperationHandle)
      .build()

    f(rs)
  } finally {
    // `rs` is still null when anything before `build()` failed. Closing it unconditionally
    // would raise a NullPointerException that hides the real failure and skips the remaining
    // cleanup below.
    if (rs != null) {
      rs.close()
    }
    connection.close()
    transport.close()
    rawTransport.close()
  }
}

/**
 * Asserts that the result set yields the expected table names, in order.
 *
 * With an empty expectation the result set must have no rows. With a non-empty expectation
 * each expected name must match the `TABLE_NAME` of the next row; rows beyond the expected
 * ones are not checked.
 *
 * @param tableNames expected table names, in result order
 * @param rs result set returned by a GetTables request
 */
def checkResult(tableNames: Seq[String], rs: HiveQueryResultSet): Unit = {
  if (tableNames.isEmpty) {
    assert(!rs.next())
  } else {
    tableNames.foreach { expectedName =>
      assert(rs.next())
      assert(rs.getString("TABLE_NAME") === expectedName)
    }
  }
}

withJdbcStatement("table1", "table2") { statement =>
  // Fixtures: two tables plus one view defined over the second table.
  Seq(
    "CREATE TABLE table1(key INT, val STRING)",
    "CREATE TABLE table2(key INT, val STRING)",
    "CREATE VIEW view1 AS SELECT * FROM table2").foreach(statement.execute)

  // Each scenario: (table name pattern, table type filter, expected table names in order).
  // Every scenario uses the "%" schema pattern; a null filter means no type restriction is
  // sent with the request.
  val scenarios: Seq[(String, JList[String], Seq[String])] = Seq(
    ("%", null, Seq("table1", "table2", "view1")),
    ("table1", null, Seq("table1")),
    ("table_not_exist", null, Seq.empty),
    ("%", JArrays.asList("TABLE"), Seq("table1", "table2")),
    ("%", JArrays.asList("VIEW"), Seq("view1")),
    ("%", JArrays.asList("TABLE", "VIEW"), Seq("table1", "table2", "view1")))

  scenarios.foreach { case (tablePattern, typeFilter, expectedNames) =>
    testGetTablesOperation("%", tablePattern, typeFilter) { rs =>
      checkResult(expectedNames, rs)
    }
  }
}
}
}