From 50f74d26ec1bc37c5f5bea054da60a1910778e46 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Sun, 5 Mar 2017 20:18:20 -0800 Subject: [PATCH 1/4] add parseTableSchema API --- .../sql/catalyst/parser/AstBuilder.scala | 2 +- .../sql/catalyst/parser/ParseDriver.scala | 7 +- .../sql/catalyst/parser/ParserInterface.scala | 4 + .../parser/TableSchemaParserSuite.scala | 85 +++++++++++++++++++ 4 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index d2e091f4dda6..1c46cca7fdbf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1476,7 +1476,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { /** * Create top level table schema. */ - protected def createSchema(ctx: ColTypeListContext): StructType = { + def createSchema(ctx: ColTypeListContext): StructType = { StructType(Option(ctx).toSeq.flatMap(visitColTypeList)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index d687a85c18b6..8fd2ac5c1316 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.Origin -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.{DataType, StructType} /** * Base SQL parsing infrastructure. @@ -49,6 +49,11 @@ abstract class AbstractSqlParser extends ParserInterface with Logging { astBuilder.visitSingleTableIdentifier(parser.singleTableIdentifier()) } + /** Creates StructType for a given SQL string. */ + override def parseTableSchema(sqlText: String): StructType = parse(sqlText) { parser => + astBuilder.createSchema(parser.colTypeList()) + } + /** Creates LogicalPlan for a given SQL string. */ override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser => astBuilder.visitSingleStatement(parser.singleStatement()) match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala index 7f35d650b957..b2662d7617fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.types.StructType /** * Interface for a parser. @@ -33,4 +34,7 @@ trait ParserInterface { /** Creates TableIdentifier for a given SQL string. */ def parseTableIdentifier(sqlText: String): TableIdentifier + + /** Creates StructType for a given SQL string. */ + def parseTableSchema(sqlText: String): StructType } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala new file mode 100644 index 000000000000..1e86796f2dc2 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala @@ -0,0 +1,85 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.catalyst.parser + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.types._ + +class TableSchemaParserSuite extends SparkFunSuite { + + def parse(sql: String): DataType = CatalystSqlParser.parseTableSchema(sql) + + def checkTableSchema(tableSchemaString: String, expectedDataType: DataType): Unit = { + test(s"parse ${tableSchemaString.replace("\n", "")}") { + assert(parse(tableSchemaString) === expectedDataType) + } + } + + checkTableSchema("a int", (new StructType).add("a", "int")) + checkTableSchema("A int", (new StructType).add("A", "int")) + checkTableSchema("a INT", (new StructType).add("a", "int")) + checkTableSchema("`!@#$%.^&*()` string", (new StructType).add("!@#$%.^&*()", "string")) + checkTableSchema("a int, b long", (new StructType).add("a", "int").add("b", "long")) + checkTableSchema("a STRUCT", + StructType( + StructField("a", StructType( + StructField("intType", IntegerType) :: + StructField("ts", TimestampType) :: Nil)) :: Nil)) + + checkTableSchema( + "a int comment 'test'", + (new StructType).add("a", "int", nullable = true, "test")) + + test("complex hive type") { + val tableSchemaString = + """ + |complexStructCol struct< + |struct:struct, + |MAP:Map, + |arrAy:Array, + |anotherArray:Array> + """.stripMargin.replace("\n", "") + + val builder = new MetadataBuilder + builder.putString(HIVE_TYPE_STRING, + "struct," + + "MAP:map,arrAy:array,anotherArray:array>") + + val expectedDataType = + StructType( + StructField("complexStructCol", StructType( + StructField("struct", + StructType( + StructField("deciMal", DecimalType.USER_DEFAULT) :: + StructField("anotherDecimal", DecimalType(5, 2)) :: Nil)) :: + StructField("MAP", MapType(TimestampType, StringType)) :: + StructField("arrAy", ArrayType(DoubleType)) :: + StructField("anotherArray", ArrayType(StringType)) :: Nil), + nullable = true, + builder.build()) :: Nil) + + assert(parse(tableSchemaString) === expectedDataType) + } + + test("illegal col types") { + val e = intercept[ParseException] { + CatalystSqlParser.parseTableSchema("a INT b long") + }.getMessage + assert(e.contains("mismatched input 'b' expecting {, '(', ',', 'COMMENT'}")) + } +} From 3ec8483d8d7e8ba4be88041ebba73a2bda922186 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Mon, 6 Mar 2017 15:37:19 -0800 Subject: [PATCH 2/4] address comments. --- .../sql/catalyst/parser/ParseDriver.scala | 5 +++- .../sql/catalyst/parser/ParserInterface.scala | 5 +++- .../parser/TableSchemaParserSuite.scala | 26 +++++++++---------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index 8fd2ac5c1316..d9c0221784cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -49,7 +49,10 @@ abstract class AbstractSqlParser extends ParserInterface with Logging { astBuilder.visitSingleTableIdentifier(parser.singleTableIdentifier()) } - /** Creates StructType for a given SQL string. */ + /** + * Creates StructType for a given SQL string, which is a comma separated list of field + * definitions which will preserve the correct Hive metadata. + */ override def parseTableSchema(sqlText: String): StructType = parse(sqlText) { parser => astBuilder.createSchema(parser.colTypeList()) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala index b2662d7617fe..6edbe253970e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala @@ -35,6 +35,9 @@ trait ParserInterface { /** Creates TableIdentifier for a given SQL string. */ def parseTableIdentifier(sqlText: String): TableIdentifier - /** Creates StructType for a given SQL string. */ + /** + * Creates StructType for a given SQL string, which is a comma separated list of field + * definitions which will preserve the correct Hive metadata. + */ def parseTableSchema(sqlText: String): StructType } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala index 1e86796f2dc2..418519f0dbbb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala @@ -30,20 +30,22 @@ class TableSchemaParserSuite extends SparkFunSuite { } } - checkTableSchema("a int", (new StructType).add("a", "int")) - checkTableSchema("A int", (new StructType).add("A", "int")) - checkTableSchema("a INT", (new StructType).add("a", "int")) - checkTableSchema("`!@#$%.^&*()` string", (new StructType).add("!@#$%.^&*()", "string")) - checkTableSchema("a int, b long", (new StructType).add("a", "int").add("b", "long")) + def intercept(sql: String): Unit = + intercept[ParseException](CatalystSqlParser.parseTableSchema(sql)) + + checkTableSchema("a int", new StructType().add("a", "int")) + checkTableSchema("A int", new StructType().add("A", "int")) + checkTableSchema("a INT", new StructType().add("a", "int")) + checkTableSchema("`!@#$%.^&*()` string", new StructType().add("!@#$%.^&*()", "string")) + checkTableSchema("a int, b long", new StructType().add("a", "int").add("b", "long")) checkTableSchema("a STRUCT", StructType( StructField("a", StructType( StructField("intType", IntegerType) :: StructField("ts", TimestampType) :: Nil)) :: Nil)) - checkTableSchema( "a int comment 'test'", - (new StructType).add("a", "int", nullable = true, "test")) + new StructType().add("a", "int", nullable = true, "test")) test("complex hive type") { val tableSchemaString = @@ -76,10 +78,8 @@ class TableSchemaParserSuite extends SparkFunSuite { assert(parse(tableSchemaString) === expectedDataType) } - test("illegal col types") { - val e = intercept[ParseException] { - CatalystSqlParser.parseTableSchema("a INT b long") - }.getMessage - assert(e.contains("mismatched input 'b' expecting {, '(', ',', 'COMMENT'}")) - } + // Negative cases + intercept("a INT b long") + intercept("a INT,, b long") + intercept("a INT, b long,,") } From 22b7db8bc013d5dcd23c3ef0f45483c47ea66b98 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Mon, 6 Mar 2017 15:55:56 -0800 Subject: [PATCH 3/4] address comments. --- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 2 +- .../org/apache/spark/sql/catalyst/parser/ParseDriver.scala | 2 +- .../spark/sql/catalyst/parser/TableSchemaParserSuite.scala | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 1c46cca7fdbf..d2e091f4dda6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1476,7 +1476,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { /** * Create top level table schema. */ - def createSchema(ctx: ColTypeListContext): StructType = { + protected def createSchema(ctx: ColTypeListContext): StructType = { StructType(Option(ctx).toSeq.flatMap(visitColTypeList)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index d9c0221784cf..f704b0998cad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -54,7 +54,7 @@ abstract class AbstractSqlParser extends ParserInterface with Logging { * definitions which will preserve the correct Hive metadata. */ override def parseTableSchema(sqlText: String): StructType = parse(sqlText) { parser => - astBuilder.createSchema(parser.colTypeList()) + StructType(astBuilder.visitColTypeList(parser.colTypeList())) } /** Creates LogicalPlan for a given SQL string. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala index 418519f0dbbb..4a7c3a3e822f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.types._ class TableSchemaParserSuite extends SparkFunSuite { - def parse(sql: String): DataType = CatalystSqlParser.parseTableSchema(sql) + def parse(sql: String): StructType = CatalystSqlParser.parseTableSchema(sql) def checkTableSchema(tableSchemaString: String, expectedDataType: DataType): Unit = { test(s"parse ${tableSchemaString.replace("\n", "")}") { @@ -79,7 +79,10 @@ class TableSchemaParserSuite extends SparkFunSuite { } // Negative cases + intercept("") + intercept("a") intercept("a INT b long") intercept("a INT,, b long") intercept("a INT, b long,,") + intercept("a INT, b long, c int,") } From b18ae84c1f0485d929e58d217c1881d037721881 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Wed, 15 Mar 2017 18:49:17 -0700 Subject: [PATCH 4/4] address comments. --- .../catalyst/parser/TableSchemaParserSuite.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala index 4a7c3a3e822f..da1041d61708 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala @@ -25,12 +25,12 @@ class TableSchemaParserSuite extends SparkFunSuite { def parse(sql: String): StructType = CatalystSqlParser.parseTableSchema(sql) def checkTableSchema(tableSchemaString: String, expectedDataType: DataType): Unit = { - test(s"parse ${tableSchemaString.replace("\n", "")}") { + test(s"parse $tableSchemaString") { assert(parse(tableSchemaString) === expectedDataType) } } - def intercept(sql: String): Unit = + def assertError(sql: String): Unit = intercept[ParseException](CatalystSqlParser.parseTableSchema(sql)) checkTableSchema("a int", new StructType().add("a", "int")) @@ -79,10 +79,10 @@ class TableSchemaParserSuite extends SparkFunSuite { } // Negative cases - intercept("") - intercept("a") - intercept("a INT b long") - intercept("a INT,, b long") - intercept("a INT, b long,,") - intercept("a INT, b long, c int,") + assertError("") + assertError("a") + assertError("a INT b long") + assertError("a INT,, b long") + assertError("a INT, b long,,") + assertError("a INT, b long, c int,") }