@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.jdbc.v2

import java.sql.Connection

import org.apache.spark.SparkConf
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog
import org.apache.spark.sql.jdbc.{DatabaseOnDocker, DockerJDBCIntegrationSuite}
import org.apache.spark.sql.types._
import org.apache.spark.tags.DockerTest

/**
 * To run this test suite for a specific version (e.g., ibmcom/db2:11.5.4.0):
 * {{{
 *   DB2_DOCKER_IMAGE_NAME=ibmcom/db2:11.5.4.0
Contributor:

DB2 docker test is much simpler than Oracle? aea78d2#diff-a003dfa2ba6f747fa3ac7f4563e78325R34-R54

Contributor Author:

The Oracle docker test includes instructions for how to build the docker image. The DB2 docker test and all the other docker tests assume the docker images are already available and only include instructions for how to run the tests. That's why the DB2 docker test looks much simpler.
E.g., here is what we have for the MS SQL Server docker test:

/**
 * To run this test suite for a specific version (e.g., 2019-GA-ubuntu-16.04):
 * {{{
 *   MSSQLSERVER_DOCKER_IMAGE_NAME=2019-GA-ubuntu-16.04
 *     ./build/sbt -Pdocker-integration-tests "test-only *MsSqlServerIntegrationSuite"
 * }}}
 */

 *     ./build/sbt -Pdocker-integration-tests "test-only *DB2IntegrationSuite"
 * }}}
 */
@DockerTest
class DB2IntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest {
  override val catalogName: String = "db2"
  override val db = new DatabaseOnDocker {
    override val imageName = sys.env.getOrElse("DB2_DOCKER_IMAGE_NAME", "ibmcom/db2:11.5.4.0")
    override val env = Map(
      "DB2INST1_PASSWORD" -> "rootpass",
      "LICENSE" -> "accept",
      "DBNAME" -> "foo",
      "ARCHIVE_LOGS" -> "false",
      "AUTOCONFIG" -> "false"
    )
    override val usesIpc = false
    override val jdbcPort: Int = 50000
    override val privileged = true
    override def getJdbcUrl(ip: String, port: Int): String =
      s"jdbc:db2://$ip:$port/foo:user=db2inst1;password=rootpass;retrieveMessagesFromServerOnGetMessage=true;" //scalastyle:ignore
  }

  override def sparkConf: SparkConf = super.sparkConf
    .set("spark.sql.catalog.db2", classOf[JDBCTableCatalog].getName)
    .set("spark.sql.catalog.db2.url", db.getJdbcUrl(dockerIp, externalPort))

  override def dataPreparation(conn: Connection): Unit = {}

  override def testUpdateColumnType(tbl: String): Unit = {
    sql(s"CREATE TABLE $tbl (ID INTEGER) USING _")
    var t = spark.table(tbl)
    var expectedSchema = new StructType().add("ID", IntegerType)
    assert(t.schema === expectedSchema)
    sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE DOUBLE")
    t = spark.table(tbl)
    expectedSchema = new StructType().add("ID", DoubleType)
    assert(t.schema === expectedSchema)
    // Update column type from DOUBLE to STRING
    val msg1 = intercept[AnalysisException] {
      sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE VARCHAR(10)")
    }.getMessage
    assert(msg1.contains("Cannot update alt_table field ID: double cannot be cast to varchar"))
  }
}
@@ -23,10 +23,8 @@ import org.scalatest.time.SpanSugar._

import org.apache.spark.SparkConf
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog
import org.apache.spark.sql.jdbc.{DatabaseOnDocker, DockerJDBCIntegrationSuite}
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types._
import org.apache.spark.tags.DockerTest

@@ -54,7 +52,8 @@ import org.apache.spark.tags.DockerTest
* It has been validated with 18.4.0 Express Edition.
*/
@DockerTest
class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSparkSession {
class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest {
override val catalogName: String = "oracle"
override val db = new DatabaseOnDocker {
override val imageName = sys.env("ORACLE_DOCKER_IMAGE_NAME")
override val env = Map(
@@ -73,80 +72,19 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSpark
override val connectionTimeout = timeout(7.minutes)
override def dataPreparation(conn: Connection): Unit = {}

test("SPARK-33034: ALTER TABLE ... add new columns") {
withTable("oracle.alt_table") {
sql("CREATE TABLE oracle.alt_table (ID STRING) USING _")
sql("ALTER TABLE oracle.alt_table ADD COLUMNS (C1 STRING, C2 STRING)")
var t = spark.table("oracle.alt_table")
var expectedSchema = new StructType()
.add("ID", StringType)
.add("C1", StringType)
.add("C2", StringType)
assert(t.schema === expectedSchema)
sql("ALTER TABLE oracle.alt_table ADD COLUMNS (C3 STRING)")
t = spark.table("oracle.alt_table")
expectedSchema = expectedSchema.add("C3", StringType)
assert(t.schema === expectedSchema)
// Add already existing column
val msg = intercept[AnalysisException] {
sql(s"ALTER TABLE oracle.alt_table ADD COLUMNS (C3 DOUBLE)")
}.getMessage
assert(msg.contains("Cannot add column, because C3 already exists"))
}
// Add a column to not existing table
val msg = intercept[AnalysisException] {
sql(s"ALTER TABLE oracle.not_existing_table ADD COLUMNS (C4 STRING)")
override def testUpdateColumnType(tbl: String): Unit = {
sql(s"CREATE TABLE $tbl (ID INTEGER) USING _")
var t = spark.table(tbl)
var expectedSchema = new StructType().add("ID", DecimalType(10, 0))
assert(t.schema === expectedSchema)
sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE STRING")
t = spark.table(tbl)
expectedSchema = new StructType().add("ID", StringType)
assert(t.schema === expectedSchema)
// Update column type from STRING to INTEGER
val msg1 = intercept[AnalysisException] {
sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE INTEGER")
}.getMessage
assert(msg.contains("Table not found"))
}

test("SPARK-33034: ALTER TABLE ... update column type") {
withTable("oracle.alt_table") {
sql("CREATE TABLE oracle.alt_table (ID INTEGER) USING _")
sql("ALTER TABLE oracle.alt_table ALTER COLUMN id TYPE STRING")
val t = spark.table("oracle.alt_table")
val expectedSchema = new StructType().add("ID", StringType)
assert(t.schema === expectedSchema)
// Update column type from STRING to INTEGER
val msg1 = intercept[AnalysisException] {
sql("ALTER TABLE oracle.alt_table ALTER COLUMN id TYPE INTEGER")
}.getMessage
assert(msg1.contains("Cannot update alt_table field ID: string cannot be cast to int"))
// Update not existing column
val msg2 = intercept[AnalysisException] {
sql("ALTER TABLE oracle.alt_table ALTER COLUMN bad_column TYPE DOUBLE")
}.getMessage
assert(msg2.contains("Cannot update missing field bad_column"))
// Update column to wrong type
val msg3 = intercept[ParseException] {
sql("ALTER TABLE oracle.alt_table ALTER COLUMN id TYPE bad_type")
}.getMessage
assert(msg3.contains("DataType bad_type is not supported"))
}
// Update column type in not existing table
val msg = intercept[AnalysisException] {
sql(s"ALTER TABLE oracle.not_existing_table ALTER COLUMN id TYPE DOUBLE")
}.getMessage
assert(msg.contains("Table not found"))
}

test("SPARK-33034: ALTER TABLE ... update column nullability") {
withTable("oracle.alt_table") {
sql("CREATE TABLE oracle.alt_table (ID STRING NOT NULL) USING _")
sql("ALTER TABLE oracle.alt_table ALTER COLUMN ID DROP NOT NULL")
val t = spark.table("oracle.alt_table")
val expectedSchema = new StructType().add("ID", StringType, nullable = true)
assert(t.schema === expectedSchema)
// Update nullability of not existing column
val msg = intercept[AnalysisException] {
sql("ALTER TABLE oracle.alt_table ALTER COLUMN bad_column DROP NOT NULL")
}.getMessage
assert(msg.contains("Cannot update missing field bad_column"))
}
// Update column nullability in not existing table
val msg = intercept[AnalysisException] {
sql(s"ALTER TABLE oracle.not_existing_table ALTER COLUMN ID DROP NOT NULL")
}.getMessage
assert(msg.contains("Table not found"))
assert(msg1.contains("Cannot update alt_table field ID: string cannot be cast to int"))
}
}
@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.jdbc.v2

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types._
import org.apache.spark.tags.DockerTest

@DockerTest
trait V2JDBCTest extends SharedSparkSession {
  val catalogName: String
  // dialect specific update column type test
  def testUpdateColumnType(tbl: String): Unit

  test("SPARK-33034: ALTER TABLE ... add new columns") {
    withTable(s"$catalogName.alt_table") {
      sql(s"CREATE TABLE $catalogName.alt_table (ID STRING) USING _")
      var t = spark.table(s"$catalogName.alt_table")
      var expectedSchema = new StructType().add("ID", StringType)
      assert(t.schema === expectedSchema)
      sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C1 STRING, C2 STRING)")
      t = spark.table(s"$catalogName.alt_table")
      expectedSchema = expectedSchema.add("C1", StringType).add("C2", StringType)
      assert(t.schema === expectedSchema)
      sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C3 STRING)")
      t = spark.table(s"$catalogName.alt_table")
      expectedSchema = expectedSchema.add("C3", StringType)
      assert(t.schema === expectedSchema)
      // Add already existing column
      val msg = intercept[AnalysisException] {
        sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C3 DOUBLE)")
      }.getMessage
      assert(msg.contains("Cannot add column, because C3 already exists"))
    }
    // Add a column to not existing table
    val msg = intercept[AnalysisException] {
      sql(s"ALTER TABLE $catalogName.not_existing_table ADD COLUMNS (C4 STRING)")
    }.getMessage
    assert(msg.contains("Table not found"))
  }

  test("SPARK-33034: ALTER TABLE ... update column type") {
    withTable(s"$catalogName.alt_table") {
      testUpdateColumnType(s"$catalogName.alt_table")
      // Update not existing column
      val msg2 = intercept[AnalysisException] {
        sql(s"ALTER TABLE $catalogName.alt_table ALTER COLUMN bad_column TYPE DOUBLE")
      }.getMessage
      assert(msg2.contains("Cannot update missing field bad_column"))
    }
    // Update column type in not existing table
    val msg = intercept[AnalysisException] {
      sql(s"ALTER TABLE $catalogName.not_existing_table ALTER COLUMN id TYPE DOUBLE")
    }.getMessage
    assert(msg.contains("Table not found"))
  }

  test("SPARK-33034: ALTER TABLE ... update column nullability") {
    withTable(s"$catalogName.alt_table") {
      sql(s"CREATE TABLE $catalogName.alt_table (ID STRING NOT NULL) USING _")
      var t = spark.table(s"$catalogName.alt_table")
      // nullable is true in the expectedSchema because Spark always sets nullable to true
      // regardless of the JDBC metadata. See https://github.com/apache/spark/pull/18445
Contributor:

I think we can change it in JDBC V2, as the table metadata is stored in the remote JDBC server directly. This can be done in a followup.

Contributor Author:

I did a couple of quick tests using the V2 write API:

sql("INSERT INTO h2.test.people SELECT 'bob', null")

and

sql("SELECT null AS ID, 'bob' AS NAME").writeTo("h2.test.people")

I got an exception from the H2 JDBC driver:

Caused by: org.h2.jdbc.JdbcSQLException: NULL not allowed for column "ID"; SQL statement:
INSERT INTO "test"."people" ("NAME","ID") VALUES (?,?) [23502-195]
	at org.h2.message.DbException.getJdbcSQLException(DbException.java:345)

So we are able to pass the null value for the NOT NULL column ID to H2, and H2 blocks the insert.

However, if I change the current code in JDBCRDD.resolveTable to use alwaysNullable = false, so that the real nullability is returned,

  def resolveTable(options: JDBCOptions): StructType = {

          ......

          JdbcUtils.getSchema(rs, dialect, alwaysNullable = false)

For the insert, I got an exception from Spark:

Cannot write incompatible data to table 'test.people':
- Cannot write nullable values to non-null column 'ID';
org.apache.spark.sql.AnalysisException: Cannot write incompatible data to table 'test.people':
- Cannot write nullable values to non-null column 'ID';
	at org.apache.spark.sql.catalyst.analysis.TableOutputResolver$.resolveOutputColumns(TableOutputResolver.scala:72)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOutputRelation$$anonfun$apply$31.applyOrElse(Analyzer.scala:3040)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOutputRelation$$anonfun$apply$31.applyOrElse(Analyzer.scala:3035)

Spark blocks the insert, and we are not able to pass the null value for the NOT NULL column ID to H2. Since the whole point of #18445 is to let the underlying database decide how to process null for a NOT NULL column, I guess we will not change alwaysNullable for JDBC V2?
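
For readers following the thread, here is a small self-contained sketch (an editorial illustration, not code from the PR) of the schema difference being discussed. JDBCRDD.resolveTable currently requests an always-nullable schema from JdbcUtils.getSchema, so a NOT NULL column reported by the driver still comes back nullable and the bad write is only rejected by the database at runtime; with alwaysNullable = false, the stricter schema lets Spark's analysis reject it up front. The object and value names below are illustrative only.

import org.apache.spark.sql.types._

object AlwaysNullableSketch {
  // Pretend the JDBC driver reported the column as: ID INTEGER NOT NULL
  val reportedNullable = false

  // Mirrors the effect of the alwaysNullable flag on the derived Catalyst schema.
  def schemaFor(alwaysNullable: Boolean): StructType =
    new StructType().add("ID", IntegerType, nullable = alwaysNullable || reportedNullable)

  def main(args: Array[String]): Unit = {
    // Current behaviour: ID is nullable, so the INSERT reaches H2 and fails there at runtime.
    println(schemaFor(alwaysNullable = true))
    // Behaviour from the experiment above: ID stays non-nullable, so
    // TableOutputResolver rejects the nullable write during analysis.
    println(schemaFor(alwaysNullable = false))
  }
}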

Contributor:

This does expose a problem in Spark: most databases allow writing nullable data to a non-nullable column and fail at runtime if they see null values. I think Spark shouldn't block it at compile time. After all, nullability is more like a constraint, not part of the data type itself. cc @rdblue @dongjoon-hyun @viirya @maropu @MaxGekk

      var expectedSchema = new StructType().add("ID", StringType, nullable = true)
      assert(t.schema === expectedSchema)
      sql(s"ALTER TABLE $catalogName.alt_table ALTER COLUMN ID DROP NOT NULL")
      t = spark.table(s"$catalogName.alt_table")
      expectedSchema = new StructType().add("ID", StringType, nullable = true)
      assert(t.schema === expectedSchema)
      // Update nullability of not existing column
      val msg = intercept[AnalysisException] {
        sql(s"ALTER TABLE $catalogName.alt_table ALTER COLUMN bad_column DROP NOT NULL")
      }.getMessage
      assert(msg.contains("Cannot update missing field bad_column"))
    }
    // Update column nullability in not existing table
    val msg = intercept[AnalysisException] {
      sql(s"ALTER TABLE $catalogName.not_existing_table ALTER COLUMN ID DROP NOT NULL")
    }.getMessage
    assert(msg.contains("Table not found"))
  }
}
@@ -591,6 +591,13 @@ class DDLParserSuite extends AnalysisTest {
None))
}

test("alter table: update column type invalid type") {
val msg = intercept[ParseException] {
parsePlan("ALTER TABLE table_name ALTER COLUMN a.b.c TYPE bad_type")
}.getMessage
assert(msg.contains("DataType bad_type is not supported"))
}

test("alter table: update column type") {
comparePlans(
parsePlan("ALTER TABLE table_name CHANGE COLUMN a.b.c TYPE bigint"),
20 changes: 20 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala
@@ -58,4 +58,24 @@ private object DB2Dialect extends JdbcDialect {
  override def renameTable(oldTable: String, newTable: String): String = {
    s"RENAME TABLE $oldTable TO $newTable"
  }

  // scalastyle:off line.size.limit
  // See https://www.ibm.com/support/knowledgecenter/en/SSEPGG_11.5.0/com.ibm.db2.luw.sql.ref.doc/doc/r0000888.html
  // scalastyle:on line.size.limit
  override def getUpdateColumnTypeQuery(
      tableName: String,
      columnName: String,
      newDataType: String): String =
    s"ALTER TABLE $tableName ALTER COLUMN $columnName SET DATA TYPE $newDataType"

  // scalastyle:off line.size.limit
  // See https://www.ibm.com/support/knowledgecenter/en/SSEPGG_11.5.0/com.ibm.db2.luw.sql.ref.doc/doc/r0000888.html
  // scalastyle:on line.size.limit
  override def getUpdateColumnNullabilityQuery(
      tableName: String,
      columnName: String,
      isNullable: Boolean): String = {
    val nullable = if (isNullable) "DROP NOT NULL" else "SET NOT NULL"
    s"ALTER TABLE $tableName ALTER COLUMN $columnName $nullable"
  }
}
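
As a quick reference, here is a minimal standalone sketch (an editorial illustration, not part of the diff) that mirrors the two query builders above and prints the DB2 statements they would emit for the alt_table scenarios exercised in V2JDBCTest; the object name and the identifiers passed in main are illustrative only.

object Db2AlterQuerySketch {
  def updateColumnType(table: String, column: String, newType: String): String =
    s"ALTER TABLE $table ALTER COLUMN $column SET DATA TYPE $newType"

  def updateColumnNullability(table: String, column: String, isNullable: Boolean): String = {
    val clause = if (isNullable) "DROP NOT NULL" else "SET NOT NULL"
    s"ALTER TABLE $table ALTER COLUMN $column $clause"
  }

  def main(args: Array[String]): Unit = {
    // Prints: ALTER TABLE alt_table ALTER COLUMN ID SET DATA TYPE DOUBLE
    println(updateColumnType("alt_table", "ID", "DOUBLE"))
    // Prints: ALTER TABLE alt_table ALTER COLUMN ID DROP NOT NULL
    println(updateColumnNullability("alt_table", "ID", isNullable = true))
  }
}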