[SPARK-18123][SQL] Use db column names instead of RDD column ones during JDBC Writing #15664
Changes from all commits:
**JdbcUtils.scala**

```diff
@@ -26,7 +26,7 @@ import scala.util.control.NonFatal
 import org.apache.spark.TaskContext
 import org.apache.spark.executor.InputMetrics
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
```
```diff
@@ -108,14 +108,36 @@ object JdbcUtils extends Logging {
   }
 
   /**
-   * Returns a PreparedStatement that inserts a row into table via conn.
+   * Returns an Insert SQL statement for inserting a row into the target table via JDBC conn.
    */
-  def insertStatement(conn: Connection, table: String, rddSchema: StructType, dialect: JdbcDialect)
-      : PreparedStatement = {
-    val columns = rddSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")
+  def getInsertStatement(
+      table: String,
+      rddSchema: StructType,
+      tableSchema: Option[StructType],
+      isCaseSensitive: Boolean,
+      dialect: JdbcDialect): String = {
+    val columns = if (tableSchema.isEmpty) {
+      rddSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")
```
> **Member (author):** The legacy behavior is used when `tableSchema` is empty.
```diff
+    } else {
+      val columnNameEquality = if (isCaseSensitive) {
+        org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
+      } else {
+        org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution
+      }
+      // The generated insert statement needs to follow rddSchema's column sequence and
+      // tableSchema's column names. When appending data into some case-sensitive DBMSs like
+      // PostgreSQL/Oracle, we need to respect the existing case-sensitive column names instead of
+      // RDD column names for user convenience.
+      val tableColumnNames = tableSchema.get.fieldNames
+      rddSchema.fields.map { col =>
+        val normalizedName = tableColumnNames.find(f => columnNameEquality(f, col.name)).getOrElse {
+          throw new AnalysisException(s"""Column "${col.name}" not found in schema $tableSchema""")
+        }
+        dialect.quoteIdentifier(normalizedName)
+      }.mkString(",")
+    }
     val placeholders = rddSchema.fields.map(_ => "?").mkString(",")
-    val sql = s"INSERT INTO $table ($columns) VALUES ($placeholders)"
-    conn.prepareStatement(sql)
+    s"INSERT INTO $table ($columns) VALUES ($placeholders)"
   }
 
   /**
```
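To make the normalization rule concrete, here is a small self-contained sketch of the same matching idea; `ColumnResolutionSketch`, `normalize`, and the resolver names are hypothetical illustrations, not part of the patch:

```scala
// Sketch: map each incoming (RDD) column onto the target table's own
// spelling, using either exact or case-insensitive name equality, and
// fail fast when a column is missing -- mirroring getInsertStatement.
object ColumnResolutionSketch {
  type Resolver = (String, String) => Boolean

  val caseSensitive: Resolver = (a, b) => a == b
  val caseInsensitive: Resolver = (a, b) => a.equalsIgnoreCase(b)

  // Returns the table's spelling for each RDD column, preserving RDD order.
  def normalize(
      rddColumns: Seq[String],
      tableColumns: Seq[String],
      resolver: Resolver): Seq[String] = {
    rddColumns.map { name =>
      tableColumns.find(resolver(_, name)).getOrElse(
        sys.error(s"""Column "$name" not found among ${tableColumns.mkString(", ")}"""))
    }
  }

  def main(args: Array[String]): Unit = {
    // The DataFrame says NAME/ID, but a PostgreSQL table was created as name/id.
    println(normalize(Seq("NAME", "ID"), Seq("name", "id"), caseInsensitive))
    // => List(name, id): the generated INSERT uses the table's own spelling.
  }
}
```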
```diff
@@ -210,6 +232,26 @@ object JdbcUtils extends Logging {
     answer
   }
 
+  /**
+   * Returns the schema if the table already exists in the JDBC database.
+   */
+  def getSchemaOption(conn: Connection, url: String, table: String): Option[StructType] = {
+    val dialect = JdbcDialects.get(url)
+
+    try {
+      val statement = conn.prepareStatement(dialect.getSchemaQuery(table))
```
> **Member:** Why should this be included in the try block?
>
> **Member (author):** Yes. It's because preparing the statement can itself throw an `SQLException` when the table does not exist.
>
> **Member (author):** Is it better not to hide that here? If so, I'll remove it.
>
> **Member:** JDBC data sources might detect whether the table exists when preparing the SQL statement. Thus, please keep it. Thanks.
```diff
+      try {
+        Some(getSchema(statement.executeQuery(), dialect))
```
> **Member:** For unsupported types, it will throw an `SQLException`.
>
> **Member:** Yes.
>
> **Member:** Then, we can keep the existing way. See master...gatorsmile:pr-15664.
```diff
+      } catch {
+        case _: SQLException => None
+      } finally {
+        statement.close()
+      }
+    } catch {
+      case _: SQLException => None
+    }
+  }
+
   /**
    * Takes a [[ResultSet]] and returns its Catalyst schema.
    *
```
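The nested try discussed in the thread above can be seen in isolation with plain JDBC. This is a hedged sketch, not the patch's code: `schemaColumnCount` is a hypothetical helper, and the `SELECT * FROM $table WHERE 1=0` probe stands in for the dialect's schema query:

```scala
import java.sql.{Connection, SQLException}

// Outer catch: prepareStatement may already fail on databases that
// validate the table name at prepare time. Inner catch: execution may
// fail (e.g. unsupported column types surface as SQLException). Either
// way the caller just sees None.
def schemaColumnCount(conn: Connection, table: String): Option[Int] = {
  try {
    val statement = conn.prepareStatement(s"SELECT * FROM $table WHERE 1=0")
    try {
      Some(statement.executeQuery().getMetaData.getColumnCount)
    } catch {
      case _: SQLException => None
    } finally {
      statement.close() // always release the statement
    }
  } catch {
    case _: SQLException => None
  }
}
```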
```diff
@@ -531,7 +573,7 @@
       table: String,
       iterator: Iterator[Row],
       rddSchema: StructType,
-      nullTypes: Array[Int],
+      insertStmt: String,
       batchSize: Int,
       dialect: JdbcDialect,
       isolationLevel: Int): Iterator[Byte] = {
```
```diff
@@ -568,9 +610,9 @@
         conn.setAutoCommit(false) // Everything in the same db transaction.
         conn.setTransactionIsolation(finalIsolationLevel)
       }
-      val stmt = insertStatement(conn, table, rddSchema, dialect)
-      val setters: Array[JDBCValueSetter] = rddSchema.fields.map(_.dataType)
-        .map(makeSetter(conn, dialect, _)).toArray
+      val stmt = conn.prepareStatement(insertStmt)
+      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))
+      val nullTypes = rddSchema.fields.map(f => getJdbcType(f.dataType, dialect).jdbcNullType)
       val numFields = rddSchema.fields.length
 
       try {
```
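For context on how `setters` and `nullTypes` are consumed, here is a simplified reconstruction of the per-row binding loop; it is schematic (`bindRow` is a hypothetical name, and the real loop in `savePartition` also handles batch-size thresholds and commits):

```scala
import java.sql.PreparedStatement
import org.apache.spark.sql.Row

// Same shape as JdbcUtils' setter type: writes one column of one row
// into the prepared statement at the given (0-based) field position.
type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit

// Bind one row: nulls must be sent with the column's JDBC type code
// (nullTypes); everything else goes through its type-specific setter.
def bindRow(
    stmt: PreparedStatement,
    row: Row,
    setters: Array[JDBCValueSetter],
    nullTypes: Array[Int]): Unit = {
  var i = 0
  while (i < setters.length) {
    if (row.isNullAt(i)) {
      stmt.setNull(i + 1, nullTypes(i)) // JDBC parameters are 1-based
    } else {
      setters(i)(stmt, row, i)
    }
    i += 1
  }
  stmt.addBatch() // queue the row for the next executeBatch()
}
```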
```diff
@@ -657,16 +699,16 @@
       df: DataFrame,
       url: String,
       table: String,
+      tableSchema: Option[StructType],
+      isCaseSensitive: Boolean,
       options: JDBCOptions): Unit = {
     val dialect = JdbcDialects.get(url)
-    val nullTypes: Array[Int] = df.schema.fields.map { field =>
-      getJdbcType(field.dataType, dialect).jdbcNullType
-    }
 
     val rddSchema = df.schema
     val getConnection: () => Connection = createConnectionFactory(options)
     val batchSize = options.batchSize
     val isolationLevel = options.isolationLevel
 
+    val insertStmt = getInsertStatement(table, rddSchema, tableSchema, isCaseSensitive, dialect)
     val repartitionedDF = options.numPartitions match {
       case Some(n) if n <= 0 => throw new IllegalArgumentException(
         s"Invalid value `$n` for parameter `${JDBCOptions.JDBC_NUM_PARTITIONS}` in table writing " +
```
```diff
@@ -675,7 +717,7 @@
       case _ => df
     }
     repartitionedDF.foreachPartition(iterator => savePartition(
-      getConnection, table, iterator, rddSchema, nullTypes, batchSize, dialect, isolationLevel)
+      getConnection, table, iterator, rddSchema, insertStmt, batchSize, dialect, isolationLevel)
     )
   }
```
> I moved this into the `case` statements. Since `JdbcUtils.tableExists` is used, `getSchemaOption` can be skipped for the other `SaveMode`s.
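Based on that comment, the call site plausibly dispatches on `SaveMode` along these lines. This is a sketch under stated assumptions: the control flow and `writeExisting` name are hypothetical; only `getSchemaOption` and `saveTable`'s new parameter list come from the diff above:

```scala
import java.sql.Connection
import org.apache.spark.sql.{DataFrame, SaveMode}
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}

// Assumed dispatch: only Append needs the existing table's schema for
// column-name normalization. A table created by the same write already
// matches the RDD schema, so passing None keeps the legacy behavior of
// using the RDD column names.
def writeExisting(
    conn: Connection,
    df: DataFrame,
    url: String,
    table: String,
    isCaseSensitive: Boolean,
    mode: SaveMode,
    options: JDBCOptions): Unit = mode match {
  case SaveMode.Append =>
    val tableSchema = JdbcUtils.getSchemaOption(conn, url, table)
    JdbcUtils.saveTable(df, url, table, tableSchema, isCaseSensitive, options)
  case _ =>
    JdbcUtils.saveTable(df, url, table, None, isCaseSensitive, options)
}
```

This also illustrates why `getSchemaOption` returns an `Option[StructType]` rather than throwing: a `None` result cleanly selects the legacy code path in `getInsertStatement`.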