@@ -635,68 +635,141 @@ test_that("spark.isotonicRegression", {
 })
 
 test_that("spark.logit", {
-  # test binary logistic regression
-  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
-  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
-  binary_data <- as.data.frame(cbind(label, feature))
-  binary_df <- createDataFrame(binary_data)
-
-  blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
-  blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
-  expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
-  blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
-  blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
-  expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0"))
-
-  # test summary of binary logistic regression
-  blr_summary <- summary(blr_model)
-  blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
-  expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653),
-               tolerance = 1e-4)
-  expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286),
-               tolerance = 1e-4)
-  blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
-  expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000),
-               tolerance = 1e-4)
-  blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
-  expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000),
-               tolerance = 1e-4)
+  # R code to reproduce the result.
+  # nolint start
+  #' library(glmnet)
+  #' iris.x = as.matrix(iris[, 1:4])
+  #' iris.y = as.factor(as.character(iris[, 5]))
+  #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+  #' coef(logit)
+  #
+  # $setosa
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                      s0
+  #               1.0981324
+  # Sepal.Length -0.2909860
+  # Sepal.Width   0.5510907
+  # Petal.Length -0.1915217
+  # Petal.Width  -0.4211946
+  #
+  # $versicolor
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                         s0
+  #               1.520061e+00
+  # Sepal.Length  2.524501e-02
+  # Sepal.Width  -5.310313e-01
+  # Petal.Length  3.656543e-02
+  # Petal.Width  -3.144464e-05
+  #
+  # $virginica
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #              -2.61819385
+  # Sepal.Length  0.26574097
+  # Sepal.Width  -0.02005932
+  # Petal.Length  0.15495629
+  # Petal.Width   0.42122607
+  # nolint end
 
-  # test model save and read
-  modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
-  write.ml(blr_model, modelPath)
-  expect_error(write.ml(blr_model, modelPath))
-  write.ml(blr_model, modelPath, overwrite = TRUE)
-  blr_model2 <- read.ml(modelPath)
-  blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction"))
-  expect_equal(blr_predict$prediction, blr_predict2$prediction)
-  expect_error(summary(blr_model2))
+  # Test multinomial logistic regression against three classes
+  df <- suppressWarnings(createDataFrame(iris))
+  model <- spark.logit(df, Species ~ ., regParam = 0.5)
+  summary <- summary(model)
+  versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
+  virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
+  setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
+  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
+  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
+  setosaCoefs <- unlist(summary$coefficients[, "setosa"])
+  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+  expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1))
+
+  # Test model save and load
+  modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+  write.ml(model, modelPath)
+  expect_error(write.ml(model, modelPath))
+  write.ml(model, modelPath, overwrite = TRUE)
+  model2 <- read.ml(modelPath)
+  coefs <- summary(model)$coefficients
+  coefs2 <- summary(model2)$coefficients
+  expect_equal(coefs, coefs2)
   unlink(modelPath)
 
-  # test prediction label as text
-  training <- suppressWarnings(createDataFrame(iris))
-  binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ]
-  binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width)
-  prediction <- predict(binomial_model, binomial_training)
+  # R code to reproduce the result.
+  # nolint start
+  #' library(glmnet)
+  #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  #' iris.x = as.matrix(iris2[, 1:4])
+  #' iris.y = as.factor(as.character(iris2[, 5]))
+  #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+  #' coef(logit)
+  #
+  # $versicolor
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #               3.93844796
+  # Sepal.Length -0.13538675
+  # Sepal.Width  -0.02386443
+  # Petal.Length -0.35076451
+  # Petal.Width  -0.77971954
+  #
+  # $virginica
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #              -3.93844796
+  # Sepal.Length  0.13538675
+  # Sepal.Width   0.02386443
+  # Petal.Length  0.35076451
+  # Petal.Width   0.77971954
+  #
+  #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
+  #' coef(logit)
+  #
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                      s0
+  # (Intercept)  -6.0824412
+  # Sepal.Length  0.2458260
+  # Sepal.Width   0.1642093
+  # Petal.Length  0.4759487
+  # Petal.Width   1.0383948
+  #
+  # nolint end
+
+  # Test multinomial logistic regression against two classes
+  df <- suppressWarnings(createDataFrame(iris))
+  training <- df[df$Species %in% c("versicolor", "virginica"), ]
+  model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
+  summary <- summary(model)
+  versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
+  virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
+  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
+  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
+  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+
+  # Test binomial logistic regression against two classes
+  model <- spark.logit(training, Species ~ ., regParam = 0.5)
+  summary <- summary(model)
+  coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
+  coefs <- unlist(summary$coefficients[, "Estimate"])
+  expect_true(all(abs(coefsR - coefs) < 0.1))
+
+  # Test prediction with string label
+  prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
-  expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
-                "versicolor", "virginica", "versicolor", "virginica", "versicolor")
+  expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
+                "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
   expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
 
-  # test multinomial logistic regression
-  label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
-  feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
-  feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
-  feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
-  feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-  data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+  # Test prediction with numeric label
+  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
+  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+  data <- as.data.frame(cbind(label, feature))
   df <- createDataFrame(data)
-
-  model <- spark.logit(df, label ~ ., family = "multinomial", thresholds = c(0, 1, 1))
-  predict1 <- collect(select(predict(model, df), "prediction"))
-  expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
-  # Summary of multinomial logistic regression is not implemented yet
-  expect_error(summary(model))
+  model <- spark.logit(df, label ~ feature)
+  prediction <- collect(select(predict(model, df), "prediction"))
+  expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0"))
 })
 
 test_that("spark.gaussianMixture", {