From 9c26e2e765ce40fc1ded22e69155b8d85f4e1598 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 6 Oct 2017 18:54:03 +0100 Subject: [PATCH 1/5] Initial commit --- .../expressions/codegen/CodeGenerator.scala | 3 +-- .../org/apache/spark/sql/internal/SQLConf.scala | 17 +++++++++++++++++ .../sql/execution/debug/DebuggingSuite.scala | 16 ++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 838c045d5bcce..19905a25dbdfa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -1177,8 +1177,7 @@ class CodegenContext { // be extremely expensive in certain cases, such as deeply-nested expressions which operate over // inputs with wide schemas. For more details on the performance issues that motivated this // flat, see SPARK-15680. - if (force || - SparkEnv.get != null && SparkEnv.get.conf.getBoolean("spark.sql.codegen.comments", false)) { + if (force || SQLConf.get.codegenComments) { val name = if (placeholderId != "") { assert(!placeHolderToComments.contains(placeholderId)) placeholderId diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 07d33fa7d52ae..b784944f4e5d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -751,6 +751,19 @@ object SQLConf { .booleanConf .createWithDefault(true) + val MAX_CASES_BRANCHES = buildConf("spark.sql.codegen.maxCaseBranches") + .internal() + .doc("The maximum number of switches supported with codegen.") + .intConf + .createWithDefault(20) + + val CODEGEN_COMMENTS = buildConf("spark.sql.codegen.comments") + .internal() + .doc("When true, put comment in the generated code. Since computing huge comments " + + "can be extremely expensive in certain cases, default is false.") + .booleanConf + .createWithDefault(false) + val CODEGEN_LOGGING_MAX_LINES = buildConf("spark.sql.codegen.logging.maxLines") .internal() .doc("The maximum number of codegen lines to log when errors occur. Use -1 for unlimited.") @@ -1539,6 +1552,10 @@ class SQLConf extends Serializable with Logging { def codegenFallback: Boolean = getConf(CODEGEN_FALLBACK) + def maxCaseBranchesForCodegen: Int = getConf(MAX_CASES_BRANCHES) + + def codegenComments: Boolean = getConf(CODEGEN_COMMENTS) + def loggingMaxLinesForCodegen: Int = getConf(CODEGEN_LOGGING_MAX_LINES) def hugeMethodLimit: Int = getConf(WHOLESTAGE_HUGE_METHOD_LIMIT) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 8251ff159e05f..be4a79e3d429f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -48,4 +48,20 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext { assert(res.forall{ case (subtree, code) => subtree.contains("Range") && code.contains("Object[]")}) } + + test("check to generate comment") { + withSQLConf("spark.sql.codegen.comments" -> "false") { + val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) + assert(res.length == 2) + assert(res.forall{ case (_, code) => + !code.contains("* Codegend pipeline") && !code.contains("// input[")}) + } + + withSQLConf("spark.sql.codegen.comments" -> "true") { + val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) + assert(res.length == 2) + assert(res.forall{ case (_, code) => + code.contains("* Codegend pipeline") && code.contains("// input[")}) + } + } } From a198901d2a1d2b164107fb415ae72abcf0b7075d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 16 Jul 2018 17:26:41 +0100 Subject: [PATCH 2/5] use StaticSQLConf --- .../apache/spark/sql/internal/SQLConf.scala | 9 +-------- .../spark/sql/internal/StaticSQLConf.scala | 7 +++++++ .../sql/execution/debug/DebuggingSuite.scala | 16 --------------- .../internal/ExecutorSideSQLConfSuite.scala | 20 +++++++++++++++++++ 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b784944f4e5d1..58ed78cac7122 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -757,13 +757,6 @@ object SQLConf { .intConf .createWithDefault(20) - val CODEGEN_COMMENTS = buildConf("spark.sql.codegen.comments") - .internal() - .doc("When true, put comment in the generated code. Since computing huge comments " + - "can be extremely expensive in certain cases, default is false.") - .booleanConf - .createWithDefault(false) - val CODEGEN_LOGGING_MAX_LINES = buildConf("spark.sql.codegen.logging.maxLines") .internal() .doc("The maximum number of codegen lines to log when errors occur. Use -1 for unlimited.") @@ -1554,7 +1547,7 @@ class SQLConf extends Serializable with Logging { def maxCaseBranchesForCodegen: Int = getConf(MAX_CASES_BRANCHES) - def codegenComments: Boolean = getConf(CODEGEN_COMMENTS) + def codegenComments: Boolean = getConf(StaticSQLConf.CODEGEN_COMMENTS) def loggingMaxLinesForCodegen: Int = getConf(CODEGEN_LOGGING_MAX_LINES) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index 384b1917a1f79..0aee436c4a88e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -74,6 +74,13 @@ object StaticSQLConf { .checkValue(maxEntries => maxEntries >= 0, "The maximum must not be negative") .createWithDefault(100) + val CODEGEN_COMMENTS = buildStaticConf("spark.sql.codegen.comments") + .internal() + .doc("When true, put comment in the generated code. Since computing huge comments " + + "can be extremely expensive in certain cases, default is false.") + .booleanConf + .createWithDefault(false) + // When enabling the debug, Spark SQL internal table properties are not filtered out; however, // some related DDL commands (e.g., ANALYZE TABLE and CREATE TABLE LIKE) might not work properly. val DEBUG_MODE = buildStaticConf("spark.sql.debug") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index be4a79e3d429f..8251ff159e05f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -48,20 +48,4 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext { assert(res.forall{ case (subtree, code) => subtree.contains("Range") && code.contains("Object[]")}) } - - test("check to generate comment") { - withSQLConf("spark.sql.codegen.comments" -> "false") { - val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) - assert(res.length == 2) - assert(res.forall{ case (_, code) => - !code.contains("* Codegend pipeline") && !code.contains("// input[")}) - } - - withSQLConf("spark.sql.codegen.comments" -> "true") { - val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) - assert(res.length == 2) - assert(res.forall{ case (_, code) => - code.contains("* Codegend pipeline") && code.contains("// input[")}) - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 855fe4f4523f2..a8b47edd54745 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.execution.debug.codegenStringSeq +import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SQLTestUtils class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { @@ -82,4 +84,22 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { assert(checks.forall(_ == true)) } } + + test("SPARK-22219: refactor to control to generate comment") { + withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> "false") { + val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() + .queryExecution.executedPlan) + assert(res.length == 2) + assert(res.forall{ case (_, code) => + !code.contains("* Codegend pipeline") && !code.contains("// input[")}) + } + + withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> "true") { + val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() + .queryExecution.executedPlan) + assert(res.length == 2) + assert(res.forall{ case (_, code) => + code.contains("* Codegend pipeline") && code.contains("// input[")}) + } + } } From 86800261afa8c451f9d0bf43903026a14ee971ae Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 17 Jul 2018 02:13:05 +0100 Subject: [PATCH 3/5] address review comment --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 58ed78cac7122..a0d3fa184cac6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -751,12 +751,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val MAX_CASES_BRANCHES = buildConf("spark.sql.codegen.maxCaseBranches") - .internal() - .doc("The maximum number of switches supported with codegen.") - .intConf - .createWithDefault(20) - val CODEGEN_LOGGING_MAX_LINES = buildConf("spark.sql.codegen.logging.maxLines") .internal() .doc("The maximum number of codegen lines to log when errors occur. Use -1 for unlimited.") @@ -1545,8 +1539,6 @@ class SQLConf extends Serializable with Logging { def codegenFallback: Boolean = getConf(CODEGEN_FALLBACK) - def maxCaseBranchesForCodegen: Int = getConf(MAX_CASES_BRANCHES) - def codegenComments: Boolean = getConf(StaticSQLConf.CODEGEN_COMMENTS) def loggingMaxLinesForCodegen: Int = getConf(CODEGEN_LOGGING_MAX_LINES) From afe889d7cd05f7a293f76103616cd62106b91305 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 31 Jul 2018 20:21:20 +0100 Subject: [PATCH 4/5] address review comments --- .../sql/catalyst/expressions/codegen/CodeGenerator.scala | 4 ---- .../scala/org/apache/spark/sql/internal/StaticSQLConf.scala | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 19905a25dbdfa..8414c1f522836 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -1173,10 +1173,6 @@ class CodegenContext { text: => String, placeholderId: String = "", force: Boolean = false): Block = { - // By default, disable comments in generated code because computing the comments themselves can - // be extremely expensive in certain cases, such as deeply-nested expressions which operate over - // inputs with wide schemas. For more details on the performance issues that motivated this - // flat, see SPARK-15680. if (force || SQLConf.get.codegenComments) { val name = if (placeholderId != "") { assert(!placeHolderToComments.contains(placeholderId)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index 0aee436c4a88e..d9c354b165e52 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -77,7 +77,8 @@ object StaticSQLConf { val CODEGEN_COMMENTS = buildStaticConf("spark.sql.codegen.comments") .internal() .doc("When true, put comment in the generated code. Since computing huge comments " + - "can be extremely expensive in certain cases, default is false.") + "can be extremely expensive in certain cases, such as deeply-nested expressions which " + + "operate over inputs with wide schemas, default is false.") .booleanConf .createWithDefault(false) From 253bc19af270185e6d419a9ed0261917f84688c1 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 1 Aug 2018 19:49:10 +0100 Subject: [PATCH 5/5] address review comment --- .../internal/ExecutorSideSQLConfSuite.scala | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index a8b47edd54745..5b4736ef4f7f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -86,20 +86,16 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { } test("SPARK-22219: refactor to control to generate comment") { - withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> "false") { - val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() - .queryExecution.executedPlan) - assert(res.length == 2) - assert(res.forall{ case (_, code) => - !code.contains("* Codegend pipeline") && !code.contains("// input[")}) - } - - withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> "true") { - val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() - .queryExecution.executedPlan) - assert(res.length == 2) - assert(res.forall{ case (_, code) => - code.contains("* Codegend pipeline") && code.contains("// input[")}) + Seq(true, false).foreach { flag => + withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) { + val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() + .queryExecution.executedPlan) + assert(res.length == 2) + assert(res.forall { case (_, code) => + (code.contains("* Codegend pipeline") == flag) && + (code.contains("// input[") == flag) + }) + } } } }