
Commit 3310789

viirya authored and srowen committed
[SPARK-11215][ML] Add multiple columns support to StringIndexer
## What changes were proposed in this pull request?

This takes over #19621 to add multi-column support to StringIndexer:

1. Supports encoding multiple columns.
2. Previously, when specifying `frequencyDesc` or `frequencyAsc` as `stringOrderType` param in `StringIndexer`, in case of equal frequency, the order of strings is undefined. After this change, the strings with equal frequency are further sorted alphabetically.

## How was this patch tested?

Added tests.

Closes #20146 from viirya/SPARK-11215.

Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
1 parent 5d672b7 commit 3310789
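
For context, a minimal sketch of the multi-column usage this change enables (not code from the patch itself): it assumes a running `SparkSession` named `spark`, made-up column names, and setter names following the usual `setInputCols`/`setOutputCols` convention for the new multi-column params.

```scala
import org.apache.spark.ml.feature.StringIndexer

// Hypothetical DataFrame with two categorical string columns.
val df = spark.createDataFrame(Seq(
  (0, "a", "x"), (1, "b", "x"), (2, "c", "y"),
  (3, "a", "y"), (4, "a", "z"), (5, "c", "z")
)).toDF("id", "category1", "category2")

// One StringIndexer now indexes both columns in a single pass.
val indexer = new StringIndexer()
  .setInputCols(Array("category1", "category2"))
  .setOutputCols(Array("category1Index", "category2Index"))
  .setStringOrderType("frequencyDesc") // equal frequencies are tie-broken alphabetically

indexer.fit(df).transform(df).show()
```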

File tree

10 files changed: +531 -113 lines changed


R/pkg/tests/fulltests/test_mllib_classification.R

Lines changed: 3 additions & 3 deletions
@@ -313,7 +313,7 @@ test_that("spark.mlp", {
   # Test predict method
   mlpTestDF <- df
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
+  expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
 
   # Test model save/load
   if (windows_with_hadoop()) {
@@ -348,12 +348,12 @@ test_that("spark.mlp", {
 
   # Test random seed
   # default seed
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
   # seed equals 10
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100, seed = 10)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

R/pkg/tests/fulltests/test_mllib_regression.R

Lines changed: 29 additions & 13 deletions
@@ -102,10 +102,18 @@ test_that("spark.glm and predict", {
 })
 
 test_that("spark.glm summary", {
+  # prepare dataset
+  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
+  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
+  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
+  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
+               "versicolor", "virginica")
+  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
+
   # gaussian family
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
 
   # test summary coefficients return matrix type
   expect_true(class(stats$coefficients) == "matrix")
@@ -126,15 +134,15 @@ test_that("spark.glm summary", {
 
   out <- capture.output(print(stats))
   expect_match(out[2], "Deviance Residuals:")
-  expect_true(any(grepl("AIC: 59.22", out)))
+  expect_true(any(grepl("AIC: 35.84", out)))
 
   # binomial family
-  df <- suppressWarnings(createDataFrame(iris))
+  df <- suppressWarnings(createDataFrame(dataset))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
                              family = binomial(link = "logit")))
 
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
 
@@ -174,17 +182,17 @@ test_that("spark.glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # Test spark.glm works with offset
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                              family = poisson(), offsetCol = "Petal_Length"))
   rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
-    data = iris, family = poisson(), offset = iris$Petal.Length)))
+    data = dataset, family = poisson(), offset = dataset$Petal.Length)))
   expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))
 
   # Test summary works on base GLM models
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = dataset)
   baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+  expect_true(abs(baseSummary$deviance - 11.84013) < 1e-4)
 
   # Test spark.glm works with regularization parameter
   data <- as.data.frame(cbind(a1, a2, b))
@@ -300,11 +308,19 @@ test_that("glm and predict", {
 })
 
 test_that("glm summary", {
+  # prepare dataset
+  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
+  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
+  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
+  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
+               "versicolor", "virginica")
+  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
+
   # gaussian family
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
 
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
 
   coefs <- stats$coefficients
   rCoefs <- rStats$coefficients
@@ -320,12 +336,12 @@ test_that("glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # binomial family
-  df <- suppressWarnings(createDataFrame(iris))
+  df <- suppressWarnings(createDataFrame(dataset))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
                        family = binomial(link = "logit")))
 
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
 

docs/ml-features.md

Lines changed: 4 additions & 2 deletions
@@ -585,11 +585,13 @@ for more details on the API.
 ## StringIndexer
 
 `StringIndexer` encodes a string column of labels to a column of label indices.
-The indices are in `[0, numLabels)`, and four ordering options are supported:
+`StringIndexer` can encode multiple columns. The indices are in `[0, numLabels)`, and four ordering options are supported:
 "frequencyDesc": descending order by label frequency (most frequent label assigned 0),
 "frequencyAsc": ascending order by label frequency (least frequent label assigned 0),
 "alphabetDesc": descending alphabetical order, and "alphabetAsc": ascending alphabetical order
-(default = "frequencyDesc").
+(default = "frequencyDesc"). Note that in case of equal frequency when under
+"frequencyDesc"/"frequencyAsc", the strings are further sorted by alphabet.
+
 The unseen labels will be put at index numLabels if user chooses to keep them.
 If the input column is numeric, we cast it to string and index the string
 values. When downstream pipeline components such as `Estimator` or
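
To make the tie-breaking note in the doc change above concrete, here is a small illustrative sketch (not part of the patch; it assumes a running `SparkSession` named `spark` with its implicits in scope): "a" is the most frequent label, while "b" and "c" tie on frequency, so under `frequencyDesc` the alphabetical tie-break yields b -> 1.0 and c -> 2.0.

```scala
import org.apache.spark.ml.feature.StringIndexer
import spark.implicits._ // assumes an existing SparkSession named `spark`

// "a" occurs three times; "b" and "c" occur twice each.
val labels = Seq("a", "a", "a", "b", "b", "c", "c").toDF("label")

val indexed = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("labelIndex")
  .setStringOrderType("frequencyDesc")
  .fit(labels)
  .transform(labels)

// Expected after this change: a -> 0.0, b -> 1.0, c -> 2.0
// (the b/c tie is resolved alphabetically rather than being undefined).
indexed.distinct().orderBy("labelIndex").show()
```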

docs/ml-guide.md

Lines changed: 9 additions & 0 deletions
@@ -110,6 +110,15 @@ and the migration guide below will explain all changes between releases.
 
 * `OneHotEncoder` which is deprecated in 2.3, is removed in 3.0 and `OneHotEncoderEstimator` is now renamed to `OneHotEncoder`.
 
+### Changes of behavior
+
+* [SPARK-11215](https://issues.apache.org/jira/browse/SPARK-11215):
+  In Spark 2.4 and previous versions, when specifying `frequencyDesc` or `frequencyAsc` as
+  `stringOrderType` param in `StringIndexer`, in case of equal frequency, the order of
+  strings is undefined. Since Spark 3.0, the strings with equal frequency are further
+  sorted by alphabet. And since Spark 3.0, `StringIndexer` supports encoding multiple
+  columns.
+
 ## From 2.2 to 2.3
 
 ### Breaking changes
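
A hedged sketch of what the migration note means in practice (again assuming a `SparkSession` named `spark`; not code from this patch): when labels tie on frequency, the fitted ordering is now deterministic and alphabetical, whereas in 2.4 it was undefined.

```scala
import org.apache.spark.ml.feature.StringIndexer
import spark.implicits._ // assumes an existing SparkSession named `spark`

// All three labels are tied on frequency (two occurrences each).
val ties = Seq("a", "a", "b", "b", "c", "c").toDF("label")

def fittedLabels(): Array[String] =
  new StringIndexer()
    .setInputCol("label")
    .setOutputCol("labelIndex")
    .setStringOrderType("frequencyDesc")
    .fit(ties)
    .labels // label at position i is assigned index i (as a double)

// Since 3.0 the tie-break makes the ordering stable and alphabetical.
assert(fittedLabels().sameElements(Array("a", "b", "c")))
assert(fittedLabels().sameElements(fittedLabels()))
```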
