Skip to content

Commit 3b07a4e

Browse files
younggyu chun authored and dongjoon-hyun committed
[SPARK-27931][SQL] Accept "true", "yes", "1", "false", "no", "0", and unique prefixes as input and trim input for the boolean data type
## What changes were proposed in this pull request? This PR makes the boolean data type accept "true", "yes", "1", "false", "no", "0", and their unique prefixes as input, and trims leading and trailing whitespace from the input before matching. Please see the following pages for the string representations that other databases accept for the boolean type. https://www.postgresql.org/docs/devel/datatype-boolean.html https://docs.aws.amazon.com/redshift/latest/dg/r_Boolean_type.html ## How was this patch tested? Added new tests to CastSuite. Closes apache#25458 from younggyuchun/SPARK-27931. Authored-by: younggyu chun <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent ea90ea6 commit 3b07a4e

File tree

4 files changed

+33
-18
lines changed

4 files changed

+33
-18
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,16 @@ object StringUtils extends Logging {
6565
"(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines
6666
}
6767

68-
private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString)
69-
private[this] val falseStrings = Set("f", "false", "n", "no", "0").map(UTF8String.fromString)
68+
// "true", "yes", "on", "1", "false", "no", "off", "0", and unique prefixes of these strings are
// accepted. "o" is not accepted because it is an ambiguous prefix of both "on" and "off".
69+
private[this] val trueStrings =
70+
Set("true", "tru", "tr", "t", "yes", "ye", "y", "on", "1").map(UTF8String.fromString)
71+
72+
private[this] val falseStrings =
73+
Set("false", "fals", "fal", "fa", "f", "no", "n", "off", "of", "0").map(UTF8String.fromString)
7074

7175
// scalastyle:off caselocale
72-
def isTrueString(s: UTF8String): Boolean = trueStrings.contains(s.toLowerCase)
73-
def isFalseString(s: UTF8String): Boolean = falseStrings.contains(s.toLowerCase)
76+
def isTrueString(s: UTF8String): Boolean = trueStrings.contains(s.toLowerCase.trim())
77+
def isFalseString(s: UTF8String): Boolean = falseStrings.contains(s.toLowerCase.trim())
7478
// scalastyle:on caselocale
7579

7680
/**

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -819,20 +819,34 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
819819
}
820820

821821
test("cast string to boolean") {
822-
checkCast("t", true)
823822
checkCast("true", true)
823+
checkCast("tru", true)
824+
checkCast("tr", true)
825+
checkCast("t", true)
824826
checkCast("tRUe", true)
825-
checkCast("y", true)
827+
checkCast(" tRue ", true)
828+
checkCast(" tRu ", true)
826829
checkCast("yes", true)
830+
checkCast("ye", true)
831+
checkCast("y", true)
827832
checkCast("1", true)
833+
checkCast("on", true)
828834

829-
checkCast("f", false)
830835
checkCast("false", false)
831-
checkCast("FAlsE", false)
832-
checkCast("n", false)
836+
checkCast("fals", false)
837+
checkCast("fal", false)
838+
checkCast("fa", false)
839+
checkCast("f", false)
840+
checkCast(" fAlse ", false)
841+
checkCast(" fAls ", false)
842+
checkCast(" FAlsE ", false)
833843
checkCast("no", false)
844+
checkCast("n", false)
834845
checkCast("0", false)
846+
checkCast("off", false)
847+
checkCast("of", false)
835848

849+
checkEvaluation(cast("o", BooleanType), null)
836850
checkEvaluation(cast("abc", BooleanType), null)
837851
checkEvaluation(cast("", BooleanType), null)
838852
}

sql/core/src/test/resources/sql-tests/inputs/pgSQL/boolean.sql

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ SELECT false AS `false`;
2222

2323
SELECT boolean('t') AS true;
2424

25-
-- [SPARK-27931] Trim the string when cast string type to boolean type
2625
SELECT boolean(' f ') AS `false`;
2726

2827
SELECT boolean('true') AS true;
@@ -49,12 +48,10 @@ SELECT boolean('no') AS `false`;
4948
-- [SPARK-27923] PostgreSQL does not accept 'nay' but Spark SQL accepts it and sets it to NULL
5049
SELECT boolean('nay') AS error;
5150

52-
-- [SPARK-27931] Accept 'on' and 'off' as input for boolean data type
5351
SELECT boolean('on') AS true;
5452

5553
SELECT boolean('off') AS `false`;
5654

57-
-- [SPARK-27931] Accept unique prefixes thereof
5855
SELECT boolean('of') AS `false`;
5956

6057
-- [SPARK-27923] PostgreSQL does not accept 'o' but Spark SQL accepts it and sets it to NULL
@@ -101,7 +98,7 @@ SELECT boolean('f') <= boolean('t') AS true;
10198

10299
-- explicit casts to/from text
103100
SELECT boolean(string('TrUe')) AS true, boolean(string('fAlse')) AS `false`;
104-
-- [SPARK-27931] Trim the string when cast to boolean type
101+
105102
SELECT boolean(string(' true ')) AS true,
106103
boolean(string(' FALSE')) AS `false`;
107104
SELECT string(boolean(true)) AS true, string(boolean(false)) AS `false`;

sql/core/src/test/resources/sql-tests/results/pgSQL/boolean.sql.out

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ SELECT boolean(' f ') AS `false`
3939
-- !query 4 schema
4040
struct<false:boolean>
4141
-- !query 4 output
42-
NULL
42+
false
4343

4444

4545
-- !query 5
@@ -127,23 +127,23 @@ SELECT boolean('on') AS true
127127
-- !query 15 schema
128128
struct<true:boolean>
129129
-- !query 15 output
130-
NULL
130+
true
131131

132132

133133
-- !query 16
134134
SELECT boolean('off') AS `false`
135135
-- !query 16 schema
136136
struct<false:boolean>
137137
-- !query 16 output
138-
NULL
138+
false
139139

140140

141141
-- !query 17
142142
SELECT boolean('of') AS `false`
143143
-- !query 17 schema
144144
struct<false:boolean>
145145
-- !query 17 output
146-
NULL
146+
false
147147

148148

149149
-- !query 18
@@ -296,7 +296,7 @@ SELECT boolean(string(' true ')) AS true,
296296
-- !query 36 schema
297297
struct<true:boolean,false:boolean>
298298
-- !query 36 output
299-
NULL NULL
299+
true false
300300

301301

302302
-- !query 37

0 commit comments

Comments
 (0)