@@ -102,10 +102,18 @@ test_that("spark.glm and predict", {
102102})
103103
104104test_that(" spark.glm summary" , {
105+ # prepare dataset
106+ Sepal.Length <- c(2.0 , 1.5 , 1.8 , 3.4 , 5.1 , 1.8 , 1.0 , 2.3 )
107+ Sepal.Width <- c(2.1 , 2.3 , 5.4 , 4.7 , 3.1 , 2.1 , 3.1 , 5.5 )
108+ Petal.Length <- c(1.8 , 2.1 , 7.1 , 2.5 , 3.7 , 6.3 , 2.2 , 7.2 )
109+ Species <- c(" setosa" , " versicolor" , " versicolor" , " versicolor" , " virginica" , " virginica" ,
110+ " versicolor" , " virginica" )
111+ dataset <- data.frame (Sepal.Length , Sepal.Width , Petal.Length , Species , stringsAsFactors = TRUE )
112+
105113 # gaussian family
106- training <- suppressWarnings(createDataFrame(iris ))
114+ training <- suppressWarnings(createDataFrame(dataset ))
107115 stats <- summary(spark.glm(training , Sepal_Width ~ Sepal_Length + Species ))
108- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = iris ))
116+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = dataset ))
109117
110118 # test summary coefficients return matrix type
111119 expect_true(class(stats $ coefficients ) == " matrix" )
@@ -126,15 +134,15 @@ test_that("spark.glm summary", {
126134
127135 out <- capture.output(print(stats ))
128136 expect_match(out [2 ], " Deviance Residuals:" )
129- expect_true(any(grepl(" AIC: 59.22 " , out )))
137+ expect_true(any(grepl(" AIC: 35.84 " , out )))
130138
131139 # binomial family
132- df <- suppressWarnings(createDataFrame(iris ))
140+ df <- suppressWarnings(createDataFrame(dataset ))
133141 training <- df [df $ Species %in% c(" versicolor" , " virginica" ), ]
134142 stats <- summary(spark.glm(training , Species ~ Sepal_Length + Sepal_Width ,
135143 family = binomial(link = " logit" )))
136144
137- rTraining <- iris [ iris $ Species %in% c(" versicolor" , " virginica" ), ]
145+ rTraining <- dataset [ dataset $ Species %in% c(" versicolor" , " virginica" ), ]
138146 rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width , data = rTraining ,
139147 family = binomial(link = " logit" )))
140148
@@ -174,17 +182,17 @@ test_that("spark.glm summary", {
174182 expect_equal(stats $ aic , rStats $ aic )
175183
176184 # Test spark.glm works with offset
177- training <- suppressWarnings(createDataFrame(iris ))
185+ training <- suppressWarnings(createDataFrame(dataset ))
178186 stats <- summary(spark.glm(training , Sepal_Width ~ Sepal_Length + Species ,
179187 family = poisson(), offsetCol = " Petal_Length" ))
180188 rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species ,
181- data = iris , family = poisson(), offset = iris $ Petal.Length )))
189+ data = dataset , family = poisson(), offset = dataset $ Petal.Length )))
182190 expect_true(all(abs(rStats $ coefficients - stats $ coefficients ) < 1e-3 ))
183191
184192 # Test summary works on base GLM models
185- baseModel <- stats :: glm(Sepal.Width ~ Sepal.Length + Species , data = iris )
193+ baseModel <- stats :: glm(Sepal.Width ~ Sepal.Length + Species , data = dataset )
186194 baseSummary <- summary(baseModel )
187- expect_true(abs(baseSummary $ deviance - 12.19313 ) < 1e-4 )
195+ expect_true(abs(baseSummary $ deviance - 11.84013 ) < 1e-4 )
188196
189197 # Test spark.glm works with regularization parameter
190198 data <- as.data.frame(cbind(a1 , a2 , b ))
@@ -300,11 +308,19 @@ test_that("glm and predict", {
300308})
301309
302310test_that(" glm summary" , {
311+ # prepare dataset
312+ Sepal.Length <- c(2.0 , 1.5 , 1.8 , 3.4 , 5.1 , 1.8 , 1.0 , 2.3 )
313+ Sepal.Width <- c(2.1 , 2.3 , 5.4 , 4.7 , 3.1 , 2.1 , 3.1 , 5.5 )
314+ Petal.Length <- c(1.8 , 2.1 , 7.1 , 2.5 , 3.7 , 6.3 , 2.2 , 7.2 )
315+ Species <- c(" setosa" , " versicolor" , " versicolor" , " versicolor" , " virginica" , " virginica" ,
316+ " versicolor" , " virginica" )
317+ dataset <- data.frame (Sepal.Length , Sepal.Width , Petal.Length , Species , stringsAsFactors = TRUE )
318+
303319 # gaussian family
304- training <- suppressWarnings(createDataFrame(iris ))
320+ training <- suppressWarnings(createDataFrame(dataset ))
305321 stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species , data = training ))
306322
307- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = iris ))
323+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = dataset ))
308324
309325 coefs <- stats $ coefficients
310326 rCoefs <- rStats $ coefficients
@@ -320,12 +336,12 @@ test_that("glm summary", {
320336 expect_equal(stats $ aic , rStats $ aic )
321337
322338 # binomial family
323- df <- suppressWarnings(createDataFrame(iris ))
339+ df <- suppressWarnings(createDataFrame(dataset ))
324340 training <- df [df $ Species %in% c(" versicolor" , " virginica" ), ]
325341 stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width , data = training ,
326342 family = binomial(link = " logit" )))
327343
328- rTraining <- iris [ iris $ Species %in% c(" versicolor" , " virginica" ), ]
344+ rTraining <- dataset [ dataset $ Species %in% c(" versicolor" , " virginica" ), ]
329345 rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width , data = rTraining ,
330346 family = binomial(link = " logit" )))
331347
0 commit comments