Skip to content

Commit c011502

Browse files
Michael ChiricoHyukjinKwon
authored andcommitted
[SPARK-31573][R] Apply fixed=TRUE as appropriate to regex usage in R
### What changes were proposed in this pull request? For regex functions in base R (`gsub`, `grep`, `grepl`, `strsplit`, `gregexpr`), supplying the `fixed=TRUE` option will be more performant. ### Why are the changes needed? This is a minor fix for performance ### Does this PR introduce any user-facing change? No (although some internal code was applying fixed-as-regex in some cases that could technically have been over-broad and caught unintended patterns) ### How was this patch tested? Not Closes #28367 from MichaelChirico/r-regex-fixed. Authored-by: Michael Chirico <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent 6ed2dfb commit c011502

File tree

7 files changed

+25
-20
lines changed

7 files changed

+25
-20
lines changed

R/pkg/R/DataFrame.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2614,7 +2614,7 @@ setMethod("join",
26142614
"left", "leftouter", "left_outer",
26152615
"right", "rightouter", "right_outer",
26162616
"semi", "left_semi", "leftsemi", "anti", "left_anti", "leftanti")) {
2617-
joinType <- gsub("_", "", joinType)
2617+
joinType <- gsub("_", "", joinType, fixed = TRUE)
26182618
sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType)
26192619
} else {
26202620
stop(paste("joinType must be one of the following types:",

R/pkg/R/SQLContext.R

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,11 @@ sparkR.conf <- function(key, defaultValue) {
110110
value <- if (missing(defaultValue)) {
111111
tryCatch(callJMethod(conf, "get", key),
112112
error = function(e) {
113-
if (any(grep("java.util.NoSuchElementException", as.character(e)))) {
113+
estr <- as.character(e)
114+
if (any(grepl("java.util.NoSuchElementException", estr, fixed = TRUE))) {
114115
stop(paste0("Config '", key, "' is not set"))
115116
} else {
116-
stop(paste0("Unknown error: ", as.character(e)))
117+
stop(paste0("Unknown error: ", estr))
117118
}
118119
})
119120
} else {
@@ -205,7 +206,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) {
205206
# SPAKR-SQL does not support '.' in column name, so replace it with '_'
206207
# TODO(davies): remove this once SPARK-2775 is fixed
207208
names <- lapply(names, function(n) {
208-
nn <- gsub("[.]", "_", n)
209+
nn <- gsub(".", "_", n, fixed = TRUE)
209210
if (nn != n) {
210211
warning(paste("Use", nn, "instead of", n, "as column name"))
211212
}

R/pkg/R/client.R

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ checkJavaVersion <- function() {
6565
javaHome <- Sys.getenv("JAVA_HOME")
6666
javaReqs <- utils::packageDescription(utils::packageName(), fields = c("SystemRequirements"))
6767
sparkJavaVersions <- strsplit(javaReqs, "[(,)]")[[1]]
68-
minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= ")[[1]][[2]])
69-
maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< ")[[1]][[2]])
68+
minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= ", fixed = TRUE)[[1]][[2]])
69+
maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< ", fixed = TRUE)[[1]][[2]])
7070
if (javaHome != "") {
7171
javaBin <- file.path(javaHome, "bin", javaBin)
7272
}
@@ -89,13 +89,13 @@ checkJavaVersion <- function() {
8989
})
9090
javaVersionFilter <- Filter(
9191
function(x) {
92-
grepl(" version", x)
92+
grepl(" version", x, fixed = TRUE)
9393
}, javaVersionOut)
9494

95-
javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2]
95+
javaVersionStr <- strsplit(javaVersionFilter[[1]], '"', fixed = TRUE)[[1L]][2]
9696
# javaVersionStr is of the form 1.8.0_92/9.0.x/11.0.x.
9797
# We are using 8, 9, 10, 11 for sparkJavaVersion.
98-
versions <- strsplit(javaVersionStr, "[.]")[[1L]]
98+
versions <- strsplit(javaVersionStr, ".", fixed = TRUE)[[1L]]
9999
if ("1" == versions[1]) {
100100
javaVersionNum <- as.integer(versions[2])
101101
} else {

R/pkg/R/install.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,9 @@ getPreferredMirror <- function(version, packageName) {
214214
file.path("spark", version, packageName),
215215
".tgz&as_json=1")
216216
textLines <- readLines(jsonUrl, warn = FALSE)
217-
rowNum <- grep("\"preferred\"", textLines)
217+
rowNum <- grep('"preferred"', textLines, fixed = TRUE)
218218
linePreferred <- textLines[rowNum]
219-
matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred)
219+
matchInfo <- regexpr('"[A-Za-z][A-Za-z0-9+-.]*://.+"', linePreferred)
220220
if (matchInfo != -1) {
221221
startPos <- matchInfo + 1
222222
endPos <- matchInfo + attr(matchInfo, "match.length") - 2

R/pkg/R/schema.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ checkType <- function(type) {
182182
# strsplit does not return the final empty string, so check if
183183
# the final char is ","
184184
if (substr(fieldsString, nchar(fieldsString), nchar(fieldsString)) != ",") {
185-
fields <- strsplit(fieldsString, ",")[[1]]
185+
fields <- strsplit(fieldsString, ",", fixed = TRUE)[[1]]
186186
for (field in fields) {
187187
m <- regexec("^(.+):(.+)$", field)
188188
matchedStrings <- regmatches(field, m)

R/pkg/R/sparkR.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ sparkR.session <- function(
435435
# Check if version number of SparkSession matches version number of SparkR package
436436
jvmVersion <- callJMethod(sparkSession, "version")
437437
# Remove -SNAPSHOT from jvm versions
438-
jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion)
438+
jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion, fixed = TRUE)
439439
rPackageVersion <- paste0(packageVersion("SparkR"))
440440

441441
if (jvmVersionStrip != rPackageVersion) {
@@ -606,7 +606,7 @@ getClientModeSparkSubmitOpts <- function(submitOps, sparkEnvirMap) {
606606
# process only if --option is not already specified
607607
if (!is.null(opsValue) &&
608608
nchar(opsValue) > 1 &&
609-
!grepl(sparkConfToSubmitOps[[conf]], submitOps)) {
609+
!grepl(sparkConfToSubmitOps[[conf]], submitOps, fixed = TRUE)) {
610610
# put "" around value in case it has spaces
611611
paste0(sparkConfToSubmitOps[[conf]], " \"", opsValue, "\" ")
612612
} else {

R/pkg/R/utils.R

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -818,30 +818,32 @@ captureJVMException <- function(e, method) {
818818
}
819819

820820
# StreamingQueryException could wrap an IllegalArgumentException, so look for that first
821-
if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ", stacktrace))) {
821+
if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ",
822+
stacktrace, fixed = TRUE))) {
822823
msg <- strsplit(stacktrace, "org.apache.spark.sql.streaming.StreamingQueryException: ",
823824
fixed = TRUE)[[1]]
824825
# Extract "Error in ..." message.
825826
rmsg <- msg[1]
826827
# Extract the first message of JVM exception.
827828
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
828829
stop(paste0(rmsg, "streaming query error - ", first), call. = FALSE)
829-
} else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace))) {
830+
} else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace, fixed = TRUE))) {
830831
msg <- strsplit(stacktrace, "java.lang.IllegalArgumentException: ", fixed = TRUE)[[1]]
831832
# Extract "Error in ..." message.
832833
rmsg <- msg[1]
833834
# Extract the first message of JVM exception.
834835
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
835836
stop(paste0(rmsg, "illegal argument - ", first), call. = FALSE)
836-
} else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace))) {
837+
} else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))) {
837838
msg <- strsplit(stacktrace, "org.apache.spark.sql.AnalysisException: ", fixed = TRUE)[[1]]
838839
# Extract "Error in ..." message.
839840
rmsg <- msg[1]
840841
# Extract the first message of JVM exception.
841842
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
842843
stop(paste0(rmsg, "analysis error - ", first), call. = FALSE)
843844
} else
844-
if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", stacktrace))) {
845+
if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
846+
stacktrace, fixed = TRUE))) {
845847
msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
846848
fixed = TRUE)[[1]]
847849
# Extract "Error in ..." message.
@@ -850,15 +852,17 @@ captureJVMException <- function(e, method) {
850852
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
851853
stop(paste0(rmsg, "no such database - ", first), call. = FALSE)
852854
} else
853-
if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", stacktrace))) {
855+
if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ",
856+
stacktrace, fixed = TRUE))) {
854857
msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ",
855858
fixed = TRUE)[[1]]
856859
# Extract "Error in ..." message.
857860
rmsg <- msg[1]
858861
# Extract the first message of JVM exception.
859862
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
860863
stop(paste0(rmsg, "no such table - ", first), call. = FALSE)
861-
} else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ", stacktrace))) {
864+
} else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ",
865+
stacktrace, fixed = TRUE))) {
862866
msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.parser.ParseException: ",
863867
fixed = TRUE)[[1]]
864868
# Extract "Error in ..." message.

0 commit comments

Comments
 (0)