
Commit 9aff6f3

gatorsmile authored and zsxwing committed
[SPARK-15515][SQL] Error Handling in Running SQL Directly On Files
#### What changes were proposed in this pull request?

This PR addresses the following issues:

- **ISSUE 1:** For the ORC source format, we report a confusing error message when Hive support is not enabled:

  ```SQL
  SQL Example:
    select id from `org.apache.spark.sql.hive.orc`.`file_path`
  Error Message:
    Table or view not found: `org.apache.spark.sql.hive.orc`.`file_path`
  ```

  Instead, we should issue an error message like:

  ```
  Expected Error Message:
    The ORC data source must be used with Hive support enabled
  ```

- **ISSUE 2:** For the Avro format, we likewise report a confusing error message:

  ```SQL
  SQL Example:
    select id from `avro`.`file_path`
    select id from `com.databricks.spark.avro`.`file_path`
  Error Message:
    Table or view not found: `com.databricks.spark.avro`.`file_path`
  ```

  The desired message should be like:

  ```
  Expected Error Message:
    Failed to find data source: avro. Please use Spark package
    http://spark-packages.org/package/databricks/spark-avro
  ```

- ~~**ISSUE 3:** Unable to detect incompatible libraries for Spark 2.0 in data source resolution; we report a strange error message.~~

**Update**: The latest code changes include:

- For the JDBC format, we added an extra check in the rule `ResolveRelations` of `Analyzer`. Without this PR, Spark returns an error message like `Option 'url' not specified`. Now we report `Unsupported data source type for direct query on files: jdbc`.
- Make the data source format name case insensitive so that error handling behaves consistently with the normal cases.
- Added test cases for all the supported formats.

#### How was this patch tested?

Added test cases to cover all the above issues.

Author: gatorsmile <[email protected]>
Author: xiaoli <[email protected]>
Author: Xiao Li <[email protected]>

Closes #13283 from gatorsmile/runSQLAgainstFile.
1 parent 8900c8d commit 9aff6f3
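
For reviewers who want to poke at the new behavior, here is a minimal spark-shell sketch; the paths are hypothetical, and the messages assume a build without Hive support and without the spark-avro package:

```scala
import org.apache.spark.sql.AnalysisException

// Direct query on a missing source now fails analysis with a clear
// message instead of "Table or view not found".
try {
  spark.sql("select id from avro.`/tmp/some_file`").show()
} catch {
  case e: AnalysisException =>
    // Expected: "Failed to find data source: avro. Please use Spark package
    // http://spark-packages.org/package/databricks/spark-avro"
    println(e.getMessage)
}

// Non-file sources are rejected up front as well.
try {
  spark.sql("select id from jdbc.`/tmp/some_file`").show()
} catch {
  case e: AnalysisException =>
    // Expected: "Unsupported data source type for direct query on files: jdbc"
    println(e.getMessage)
}
```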

File tree

6 files changed: +134 -34 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 13 additions & 21 deletions

```diff
@@ -132,28 +132,20 @@ case class DataSource(
           // Found the data source using fully qualified path
           dataSource
         case Failure(error) =>
-          if (error.isInstanceOf[ClassNotFoundException]) {
-            val className = error.getMessage
-            if (spark2RemovedClasses.contains(className)) {
-              throw new ClassNotFoundException(s"$className is removed in Spark 2.0. " +
-                "Please check if your library is compatible with Spark 2.0")
-            }
-          }
-          if (provider.startsWith("org.apache.spark.sql.hive.orc")) {
-            throw new ClassNotFoundException(
-              "The ORC data source must be used with Hive support enabled.", error)
+          if (provider.toLowerCase == "orc" ||
+            provider.startsWith("org.apache.spark.sql.hive.orc")) {
+            throw new AnalysisException(
+              "The ORC data source must be used with Hive support enabled")
+          } else if (provider.toLowerCase == "avro" ||
+            provider == "com.databricks.spark.avro") {
+            throw new AnalysisException(
+              s"Failed to find data source: ${provider.toLowerCase}. Please use Spark " +
+                "package http://spark-packages.org/package/databricks/spark-avro")
           } else {
-            if (provider == "avro" || provider == "com.databricks.spark.avro") {
-              throw new ClassNotFoundException(
-                s"Failed to find data source: $provider. Please use Spark package " +
-                  "http://spark-packages.org/package/databricks/spark-avro",
-                error)
-            } else {
-              throw new ClassNotFoundException(
-                s"Failed to find data source: $provider. Please find packages at " +
-                  "http://spark-packages.org",
-                error)
-            }
+            throw new ClassNotFoundException(
+              s"Failed to find data source: $provider. Please find packages at " +
+                "http://spark-packages.org",
+              error)
           }
       }
     } catch {
```
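
The gist of this change, pulled out of the diff as a self-contained sketch: map a provider name whose class lookup failed to a user-friendly error. `checkProviderError` is a hypothetical name, and `IllegalArgumentException` stands in for Spark's `AnalysisException`, whose constructor is not public outside the `org.apache.spark.sql` package:

```scala
// Standalone sketch of the failure branch above (illustrative only).
def checkProviderError(provider: String, error: Throwable): Nothing = {
  if (provider.toLowerCase == "orc" ||
      provider.startsWith("org.apache.spark.sql.hive.orc")) {
    // ORC ships with the Hive module, so a bare "orc" lookup failing
    // means Hive support is not on the classpath.
    throw new IllegalArgumentException(
      "The ORC data source must be used with Hive support enabled")
  } else if (provider.toLowerCase == "avro" ||
      provider == "com.databricks.spark.avro") {
    // Avro lives in an external Spark package.
    throw new IllegalArgumentException(
      s"Failed to find data source: ${provider.toLowerCase}. Please use Spark " +
        "package http://spark-packages.org/package/databricks/spark-avro")
  } else {
    // Everything else keeps the generic ClassNotFoundException, with the
    // original lookup failure attached as the cause.
    throw new ClassNotFoundException(
      s"Failed to find data source: $provider. Please find packages at " +
        "http://spark-packages.org", error)
  }
}
```

Note the `toLowerCase` comparisons: this is what makes `Avro`, `avro`, and `ORC` behave identically, per the case-insensitivity item in the description.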

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala

Lines changed: 13 additions & 1 deletion

```diff
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import scala.util.control.NonFatal
+
 import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
@@ -28,7 +30,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation}
 
 /**
- * Try to replaces [[UnresolvedRelation]]s with [[ResolvedDataSource]].
+ * Try to replaces [[UnresolvedRelation]]s with [[ResolveDataSource]].
  */
 private[sql] class ResolveDataSource(sparkSession: SparkSession) extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
@@ -38,6 +40,16 @@ private[sql] class ResolveDataSource(sparkSession: SparkSession) extends Rule[Lo
           sparkSession,
           paths = u.tableIdentifier.table :: Nil,
           className = u.tableIdentifier.database.get)
+
+        val notSupportDirectQuery = try {
+          !classOf[FileFormat].isAssignableFrom(dataSource.providingClass)
+        } catch {
+          case NonFatal(e) => false
+        }
+        if (notSupportDirectQuery) {
+          throw new AnalysisException("Unsupported data source type for direct query on files: " +
+            s"${u.tableIdentifier.database.get}")
+        }
         val plan = LogicalRelation(dataSource.resolveRelation())
         u.alias.map(a => SubqueryAlias(u.alias.get, plan)).getOrElse(plan)
       } catch {
```
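
The new guard is a plain reflection check: a provider may be queried directly on files only if its providing class implements `FileFormat`. A minimal illustration of the same pattern, with `FileLike`, `ParquetLike`, and `JdbcLike` as made-up stand-ins for the real classes:

```scala
// The rule's guard boils down to Class.isAssignableFrom:
// "is the provider's class a FileFormat?"
trait FileLike                       // stands in for FileFormat
class ParquetLike extends FileLike   // a file-based source, e.g. ParquetFileFormat
class JdbcLike                       // a non-file source, e.g. the JDBC provider

val fileBased = classOf[FileLike].isAssignableFrom(classOf[ParquetLike])  // true
val jdbcBased = classOf[FileLike].isAssignableFrom(classOf[JdbcLike])     // false

// Mirroring the rule: reject direct file queries for non-file sources.
if (!jdbcBased) {
  println("Unsupported data source type for direct query on files: jdbc")
}
```

Wrapping the check in `try`/`NonFatal` and defaulting to `false` means a provider whose class cannot even be loaded falls through to the existing "failed to find data source" handling rather than this new error.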

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 47 additions & 6 deletions

```diff
@@ -1838,20 +1838,61 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
         df)
       })
 
-    val e1 = intercept[AnalysisException] {
+    var e = intercept[AnalysisException] {
       sql("select * from in_valid_table")
     }
-    assert(e1.message.contains("Table or view not found"))
+    assert(e.message.contains("Table or view not found"))
 
-    val e2 = intercept[AnalysisException] {
+    e = intercept[AnalysisException] {
       sql("select * from no_db.no_table").show()
     }
-    assert(e2.message.contains("Table or view not found"))
+    assert(e.message.contains("Table or view not found"))
 
-    val e3 = intercept[AnalysisException] {
+    e = intercept[AnalysisException] {
       sql("select * from json.invalid_file")
     }
-    assert(e3.message.contains("Path does not exist"))
+    assert(e.message.contains("Path does not exist"))
+
+    e = intercept[AnalysisException] {
+      sql(s"select id from `org.apache.spark.sql.hive.orc`.`file_path`")
+    }
+    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
+
+    e = intercept[AnalysisException] {
+      sql(s"select id from `com.databricks.spark.avro`.`file_path`")
+    }
+    assert(e.message.contains("Failed to find data source: com.databricks.spark.avro. " +
+      "Please use Spark package http://spark-packages.org/package/databricks/spark-avro"))
+
+    // data source type is case insensitive
+    e = intercept[AnalysisException] {
+      sql(s"select id from Avro.`file_path`")
+    }
+    assert(e.message.contains("Failed to find data source: avro. Please use Spark package " +
+      "http://spark-packages.org/package/databricks/spark-avro"))
+
+    e = intercept[AnalysisException] {
+      sql(s"select id from avro.`file_path`")
+    }
+    assert(e.message.contains("Failed to find data source: avro. Please use Spark package " +
+      "http://spark-packages.org/package/databricks/spark-avro"))
+
+    e = intercept[AnalysisException] {
+      sql(s"select id from `org.apache.spark.sql.sources.HadoopFsRelationProvider`.`file_path`")
+    }
+    assert(e.message.contains("Table or view not found: " +
+      "`org.apache.spark.sql.sources.HadoopFsRelationProvider`.`file_path`"))
+
+    e = intercept[AnalysisException] {
+      sql(s"select id from `Jdbc`.`file_path`")
+    }
+    assert(e.message.contains("Unsupported data source type for direct query on files: Jdbc"))
+
+    e = intercept[AnalysisException] {
+      sql(s"select id from `org.apache.spark.sql.execution.datasources.jdbc`.`file_path`")
+    }
+    assert(e.message.contains("Unsupported data source type for direct query on files: " +
+      "org.apache.spark.sql.execution.datasources.jdbc"))
   }
 
   test("SortMergeJoin returns wrong results when using UnsafeRows") {
```

sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala

Lines changed: 3 additions & 2 deletions

```diff
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.{AnalysisException, SQLContext}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types.{StringType, StructField, StructType}
 
@@ -42,9 +42,10 @@ class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {
   }
 
   test("should fail to load ORC without Hive Support") {
-    intercept[ClassNotFoundException] {
+    val e = intercept[AnalysisException] {
       spark.read.format("orc").load()
     }
+    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
   }
 }
```

sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala

Lines changed: 12 additions & 2 deletions

```diff
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.execution.datasources.DataSource
 
 class ResolvedDataSourceSuite extends SparkFunSuite {
@@ -60,13 +61,22 @@
       classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
   }
 
+  test("csv") {
+    assert(
+      getProvidingClass("csv") ===
+      classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
+    assert(
+      getProvidingClass("com.databricks.spark.csv") ===
+      classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
+  }
+
   test("error message for unknown data sources") {
-    val error1 = intercept[ClassNotFoundException] {
+    val error1 = intercept[AnalysisException] {
       getProvidingClass("avro")
     }
     assert(error1.getMessage.contains("spark-packages"))
 
-    val error2 = intercept[ClassNotFoundException] {
+    val error2 = intercept[AnalysisException] {
       getProvidingClass("com.databricks.spark.avro")
     }
     assert(error2.getMessage.contains("spark-packages"))
```
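
For context, the suite's `getProvidingClass` helper is not shown in this hunk; it presumably resolves a provider name to its implementing class via `DataSource`, along these lines (a sketch, not verified against the hunk):

```scala
// A sketch of the suite's helper: resolve a short name or
// fully-qualified name to the class that provides the data source.
private def getProvidingClass(name: String): Class[_] =
  DataSource(sparkSession = null, className = name).providingClass
```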

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 46 additions & 2 deletions

```diff
@@ -1247,11 +1247,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
     }
   }
 
-  test("run sql directly on files") {
+  test("run sql directly on files - parquet") {
     val df = spark.range(100).toDF()
     withTempPath(f => {
       df.write.parquet(f.getCanonicalPath)
-      checkAnswer(sql(s"select id from parquet.`${f.getCanonicalPath}`"),
+      // data source type is case insensitive
+      checkAnswer(sql(s"select id from Parquet.`${f.getCanonicalPath}`"),
         df)
       checkAnswer(sql(s"select id from `org.apache.spark.sql.parquet`.`${f.getCanonicalPath}`"),
         df)
@@ -1260,6 +1261,49 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
     })
   }
 
+  test("run sql directly on files - orc") {
+    val df = spark.range(100).toDF()
+    withTempPath(f => {
+      df.write.orc(f.getCanonicalPath)
+      // data source type is case insensitive
+      checkAnswer(sql(s"select id from ORC.`${f.getCanonicalPath}`"),
+        df)
+      checkAnswer(sql(s"select id from `org.apache.spark.sql.hive.orc`.`${f.getCanonicalPath}`"),
+        df)
+      checkAnswer(sql(s"select a.id from orc.`${f.getCanonicalPath}` as a"),
+        df)
+    })
+  }
+
+  test("run sql directly on files - csv") {
+    val df = spark.range(100).toDF()
+    withTempPath(f => {
+      df.write.csv(f.getCanonicalPath)
+      // data source type is case insensitive
+      checkAnswer(sql(s"select cast(_c0 as int) id from CSV.`${f.getCanonicalPath}`"),
+        df)
+      checkAnswer(
+        sql(s"select cast(_c0 as int) id from `com.databricks.spark.csv`.`${f.getCanonicalPath}`"),
+        df)
+      checkAnswer(sql(s"select cast(a._c0 as int) id from csv.`${f.getCanonicalPath}` as a"),
+        df)
+    })
+  }
+
+  test("run sql directly on files - json") {
+    val df = spark.range(100).toDF()
+    withTempPath(f => {
+      df.write.json(f.getCanonicalPath)
+      // data source type is case insensitive
+      checkAnswer(sql(s"select id from jsoN.`${f.getCanonicalPath}`"),
+        df)
+      checkAnswer(sql(s"select id from `org.apache.spark.sql.json`.`${f.getCanonicalPath}`"),
+        df)
+      checkAnswer(sql(s"select a.id from json.`${f.getCanonicalPath}` as a"),
+        df)
+    })
+  }
+
   test("SPARK-8976 Wrong Result for Rollup #1") {
     checkAnswer(sql(
       "SELECT count(*) AS cnt, key % 5, grouping_id() FROM src GROUP BY key%5 WITH ROLLUP"),
```
