Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,18 @@ public static int collationNameToId(String collationName) throws SparkException
return Collation.CollationSpec.collationNameToId(collationName);
}

/**
 * Returns whether the collation identified by the given collation id is
 * Case Sensitive and Accent Insensitive (CS_AI).
 * This method is used in expressions which do not support CS_AI collations.
 *
 * @param collationId id of the collation to inspect
 * @return true only when the collation spec decoded from the id is both case sensitive (CS)
 *         and accent insensitive (AI)
 */
public static boolean isCaseSensitiveAndAccentInsensitive(int collationId) {
  // Decode the spec once and reuse it for both comparisons.
  Collation.CollationSpecICU spec = Collation.CollationSpecICU.fromCollationId(collationId);
  return spec.caseSensitivity == Collation.CollationSpecICU.CaseSensitivity.CS &&
    spec.accentSensitivity == Collation.CollationSpecICU.AccentSensitivity.AI;
}

public static void assertValidProvider(String provider) throws SparkException {
if (!SUPPORTED_PROVIDERS.contains(provider.toLowerCase())) {
Map<String, String> params = Map.of(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,12 @@ case object StringTypeBinaryLcase extends AbstractStringType {
/**
 * Abstract string type that accepts any [[StringType]], whatever its collation.
 */
case object StringTypeAnyCollation extends AbstractStringType {
  override private[sql] def acceptsType(other: DataType): Boolean = other match {
    case _: StringType => true
    case _ => false
  }
}

/**
 * Use StringTypeNonCSAICollation for expressions supporting all possible collation types except
 * CS_AI collation types.
 */
case object StringTypeNonCSAICollation extends AbstractStringType {
  // A typed pattern replaces the isInstanceOf/asInstanceOf pair: only a StringType whose
  // isNonCSAI check passes is accepted; every other data type is rejected.
  override private[sql] def acceptsType(other: DataType): Boolean = other match {
    case stringType: StringType => stringType.isNonCSAI
    case _ => false
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ class StringType private (val collationId: Int) extends AtomicType with Serializ
/**
 * Whether the collation fetched from CollationFactory for this type's collationId reports
 * support for lowercase equality.
 */
private[sql] def supportsLowercaseEquality: Boolean = {
  val collation = CollationFactory.fetchCollation(collationId)
  collation.supportsLowercaseEquality
}

/**
 * True unless CollationFactory classifies this type's collationId as case sensitive and
 * accent insensitive (CS_AI).
 */
private[sql] def isNonCSAI: Boolean = {
  val isCsAi = CollationFactory.isCaseSensitiveAndAccentInsensitive(collationId)
  !isCsAi
}

/** True when this type's collationId equals CollationFactory.UTF8_BINARY_COLLATION_ID. */
private[sql] def isUTF8BinaryCollation: Boolean = {
  val binaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID
  binaryCollationId == collationId
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.types.StringTypeAnyCollation
import org.apache.spark.sql.internal.types.StringTypeNonCSAICollation
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.ArrayImplicits._
Expand Down Expand Up @@ -579,7 +579,7 @@ case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: E
/** Third operand: the key/value delimiter expression. */
override def third: Expression = keyValueDelim

/**
 * All three inputs must be string types accepted by StringTypeNonCSAICollation.
 */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation, StringTypeNonCSAICollation)

/** The result is a map whose key and value types both equal the data type of `first`. */
override def dataType: DataType = MapType(first.dataType, first.dataType)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LO
import org.apache.spark.sql.catalyst.util.{ArrayData, CharsetProvider, CollationFactory, CollationSupport, GenericArrayData, TypeUtils}
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation}
import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation, StringTypeNonCSAICollation}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.UTF8StringBuilder
import org.apache.spark.unsafe.array.ByteArrayMethods
Expand Down Expand Up @@ -609,6 +609,8 @@ case class Contains(left: Expression, right: Expression) extends StringPredicate
defineCodeGen(ctx, ev, (c1, c2) =>
CollationSupport.Contains.genCode(c1, c2, collationId))
}

/** Both operands must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] = {
  val stringInput = StringTypeNonCSAICollation
  Seq(stringInput, stringInput)
}

/** Rebuilds this expression with the given replacement children. */
override protected def withNewChildrenInternal(
    newLeft: Expression,
    newRight: Expression): Contains = {
  copy(left = newLeft, right = newRight)
}
}
Expand Down Expand Up @@ -650,6 +652,10 @@ case class StartsWith(left: Expression, right: Expression) extends StringPredica
defineCodeGen(ctx, ev, (c1, c2) =>
CollationSupport.StartsWith.genCode(c1, c2, collationId))
}

/**
 * Both operands must be string types accepted by StringTypeNonCSAICollation.
 * The list has exactly two entries because this expression has two children
 * (`left` and `right`).
 */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation)

/** Rebuilds this expression with the given replacement children. */
override protected def withNewChildrenInternal(
    newLeft: Expression, newRight: Expression): StartsWith =
  copy(left = newLeft, right = newRight)
}
Expand Down Expand Up @@ -691,6 +697,10 @@ case class EndsWith(left: Expression, right: Expression) extends StringPredicate
defineCodeGen(ctx, ev, (c1, c2) =>
CollationSupport.EndsWith.genCode(c1, c2, collationId))
}

/**
 * Both operands must be string types accepted by StringTypeNonCSAICollation.
 * The list has exactly two entries because this expression has two children
 * (`left` and `right`).
 */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation)

/** Rebuilds this expression with the given replacement children. */
override protected def withNewChildrenInternal(
    newLeft: Expression, newRight: Expression): EndsWith =
  copy(left = newLeft, right = newRight)
}
Expand Down Expand Up @@ -919,7 +929,7 @@ case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExp

/** The result has the same data type as the source expression. */
override def dataType: DataType = srcExpr.dataType

/** All three inputs must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation, StringTypeNonCSAICollation)

// Operand order: source, search, replacement.
override def first: Expression = srcExpr
override def second: Expression = searchExpr
override def third: Expression = replaceExpr
Expand Down Expand Up @@ -1167,7 +1177,7 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac

/** The result has the same data type as the source expression. */
override def dataType: DataType = srcExpr.dataType

/** All three inputs must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation, StringTypeNonCSAICollation)

// Operand order: source, matching, replacement.
override def first: Expression = srcExpr
override def second: Expression = matchingExpr
override def third: Expression = replaceExpr
Expand Down Expand Up @@ -1394,6 +1404,9 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None)
/**
 * Evaluates this expression by delegating to CollationSupport.StringTrim.exec with this
 * expression's collationId.
 */
override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = {
  CollationSupport.StringTrim.exec(srcString, trimString, collationId)
}

/** Both inputs must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] = {
  val stringInput = StringTypeNonCSAICollation
  Seq(stringInput, stringInput)
}

override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression =
copy(
srcStr = newChildren.head,
Expand Down Expand Up @@ -1501,6 +1514,9 @@ case class StringTrimLeft(srcStr: Expression, trimStr: Option[Expression] = None
/**
 * Evaluates this expression by delegating to CollationSupport.StringTrimLeft.exec with this
 * expression's collationId.
 */
override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = {
  CollationSupport.StringTrimLeft.exec(srcString, trimString, collationId)
}

/** Both inputs must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] = {
  val stringInput = StringTypeNonCSAICollation
  Seq(stringInput, stringInput)
}

override protected def withNewChildrenInternal(
newChildren: IndexedSeq[Expression]): StringTrimLeft =
copy(
Expand Down Expand Up @@ -1561,6 +1577,9 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non
/**
 * Evaluates this expression by delegating to CollationSupport.StringTrimRight.exec with this
 * expression's collationId.
 */
override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = {
  CollationSupport.StringTrimRight.exec(srcString, trimString, collationId)
}

/** Both inputs must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] = {
  val stringInput = StringTypeNonCSAICollation
  Seq(stringInput, stringInput)
}

override protected def withNewChildrenInternal(
newChildren: IndexedSeq[Expression]): StringTrimRight =
copy(
Expand Down Expand Up @@ -1595,7 +1614,7 @@ case class StringInstr(str: Expression, substr: Expression)
/** Right operand: the substring expression. */
override def right: Expression = substr

/** The result is an integer. */
override def dataType: DataType = IntegerType

/** Both inputs must be string types accepted by StringTypeNonCSAICollation. */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation)

override def nullSafeEval(string: Any, sub: Any): Any = {
CollationSupport.StringInstr.
Expand Down Expand Up @@ -1643,7 +1662,7 @@ case class SubstringIndex(strExpr: Expression, delimExpr: Expression, countExpr:

/** The result has the same data type as the string expression. */
override def dataType: DataType = strExpr.dataType

/**
 * The string and delimiter must be string types accepted by StringTypeNonCSAICollation;
 * the count must be an integer.
 */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation, IntegerType)

// Operand order: string, delimiter, count.
override def first: Expression = strExpr
override def second: Expression = delimExpr
override def third: Expression = countExpr
Expand Down Expand Up @@ -1701,7 +1720,7 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression)
/** Nullable when either the substring or the string operand is nullable. */
override def nullable: Boolean = substr.nullable || str.nullable

/** The result is an integer. */
override def dataType: DataType = IntegerType

/**
 * The first two inputs must be string types accepted by StringTypeNonCSAICollation;
 * the third must be an integer.
 */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation, IntegerType)

override def eval(input: InternalRow): Any = {
val s = start.eval(input)
Expand Down Expand Up @@ -3463,7 +3482,7 @@ case class SplitPart (
false)
/** Name used for this node. */
override def nodeName: String = "split_part"

/**
 * The string and delimiter must be string types accepted by StringTypeNonCSAICollation;
 * the part number must be an integer.
 */
override def inputTypes: Seq[AbstractDataType] =
  Seq(StringTypeNonCSAICollation, StringTypeNonCSAICollation, IntegerType)

/** Children in order: string, delimiter, part number. */
def children: Seq[Expression] = Seq(str, delimiter, partNum)
protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = {
copy(str = newChildren.apply(0), delimiter = newChildren.apply(1),
Expand Down
Loading