@@ -28,6 +28,7 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
2828 @ transient var instances : Array [Instance ] = _
2929 @ transient var instancesConstantFeature : Array [Instance ] = _
3030 @ transient var instancesConstantFeatureFiltered : Array [Instance ] = _
31+ @ transient var standardizedInstances : Array [Instance ] = _
3132
3233 override def beforeAll (): Unit = {
3334 super .beforeAll()
@@ -46,6 +47,7 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
4647 Instance (1.0 , 0.5 , Vectors .dense(1.0 )),
4748 Instance (2.0 , 0.3 , Vectors .dense(0.5 ))
4849 )
50+ standardizedInstances = standardize(instances)
4951 }
5052
5153 /** Get summary statistics for some data and create a new HingeAggregator. */
@@ -61,18 +63,27 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
6163 new HingeAggregator (bcFeaturesStd, fitIntercept)(bcCoefficients)
6264 }
6365
/**
 * Scale each feature value by that feature's standard deviation, where the standard
 * deviations come from Summarizer.getClassificationSummarizers run over the given
 * instances. Labels and weights are carried over unchanged, and the scaled features
 * are stored in compressed form.
 */
66+ private def standardize (instances : Array [Instance ]): Array [Instance ] = {
67+ val (featuresSummarizer, _) =
68+ Summarizer .getClassificationSummarizers(sc.parallelize(instances))
69+ val stdArray = featuresSummarizer.std.toArray
70+ val numFeatures = stdArray.length
71+ instances.map { case Instance (label, weight, features) =>
// Array.ofDim[Double] starts out filled with 0.0, so entries never written below stay 0.0.
72+ val standardized = Array .ofDim[Double ](numFeatures)
73+ features.foreachNonZero { (i, v) =>
74+ val std = stdArray(i)
// A zero standard deviation skips the write rather than dividing by zero.
75+ if (std != 0 ) standardized(i) = v / std
76+ }
77+ Instance (label, weight, Vectors .dense(standardized).compressed)
78+ }
79+ }
80+
6481 /** Broadcast the given coefficients and create a new BlockHingeAggregator. */
// The removed (-) lines computed summary statistics from `instances`; the added (+) lines
// do not, and pass only fitIntercept plus the broadcast coefficients to the constructor.
6582 private def getNewBlockAggregator (
66- instances : Array [Instance ],
6783 coefficients : Vector ,
68- fitIntercept : Boolean ,
69- blockSize : Int ): BlockHingeAggregator = {
70- val (featuresSummarizer, ySummarizer) =
71- Summarizer .getClassificationSummarizers(sc.parallelize(instances))
72- val featuresStd = featuresSummarizer.std.toArray
73- val numFeatures = featuresStd.length
84+ fitIntercept : Boolean ): BlockHingeAggregator = {
7485 val bcCoefficients = spark.sparkContext.broadcast(coefficients)
75- new BlockHingeAggregator (numFeatures, fitIntercept, blockSize )(bcCoefficients)
86+ new BlockHingeAggregator (fitIntercept)(bcCoefficients)
7687 }
7788
7889 test(" aggregator add method input size" ) {
@@ -153,8 +164,26 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
153164 }
154165 val gradient = Vectors .dense((gradientCoef ++ Array (gradientIntercept)).map(_ / weightSum))
155166
156- assert(loss ~== agg.loss relTol 0.01 )
157- assert(gradient ~== agg.gradient relTol 0.01 )
167+ assert(loss ~== agg.loss relTol 1e-9 )
168+ assert(gradient ~== agg.gradient relTol 1e-9 )
169+
170+ Seq (1 , 2 , 4 ).foreach { blockSize =>
171+ val blocks1 = standardizedInstances
172+ .grouped(blockSize)
173+ .map(seq => InstanceBlock .fromInstances(seq))
174+ .toArray
175+ val blocks2 = blocks1.map { block =>
176+ new InstanceBlock (block.labels, block.weights, block.matrix.toSparseRowMajor)
177+ }
178+
179+ Seq (blocks1, blocks2).foreach { blocks =>
180+ val blockAgg = getNewBlockAggregator(Vectors .dense(coefArray ++ Array (intercept)),
181+ fitIntercept = true )
182+ blocks.foreach(blockAgg.add)
183+ assert(loss ~== blockAgg.loss relTol 1e-9 )
184+ assert(gradient ~== blockAgg.gradient relTol 1e-9 )
185+ }
186+ }
158187 }
159188
160189 test(" check with zero standard deviation" ) {
@@ -172,51 +201,4 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
172201 assert(aggConstantFeatureBinary.gradient(0 ) === 0.0 )
173202 assert(aggConstantFeatureBinary.gradient(1 ) == aggConstantFeatureBinaryFiltered.gradient(0 ))
174203 }
175-
// Removed test (every line below carries a `-` marker). It grouped `instances` two at a
// time into blocks, derived a sparse-row-major copy (blocks2) and an alternating
// dense / sparse-row-major copy (blocks3), and checked that aggregators created with
// blockSize 1, 2 and 4 reproduce the loss and gradient of a blockSize = 1 baseline,
// once with an intercept and once without.
// NOTE(review): the replacement check visible in this diff uses only fitIntercept = true
// and the blocks1 / blocks2 sets; confirm the fitIntercept = false and alternating
// dense/sparse cases are still covered elsewhere.
176- test(" Block HingeAggregator" ) {
177- val coefArray = Array (1.0 , 2.0 )
178- val intercept = 1.0
179- val blocks1 = instances
180- .grouped(2 )
181- .map(seq => InstanceBlock .fromInstances(seq))
182- .toArray
183-
184- val blocks2 = blocks1.map { block =>
185- new InstanceBlock (block.labels, block.weights, block.matrix.toSparseRowMajor)
186- }
187-
188- val blocks3 = blocks1.zipWithIndex.map { case (block, i) =>
189- if (i % 2 == 0 ) {
190- new InstanceBlock (block.labels, block.weights, block.matrix.toDense)
191- } else {
192- new InstanceBlock (block.labels, block.weights, block.matrix.toSparseRowMajor)
193- }
194- }
195-
// Baseline with an intercept: coefficients are coefArray followed by the intercept.
196- val agg1 = getNewBlockAggregator(instances, Vectors .dense(coefArray ++ Array (intercept)),
197- fitIntercept = true , blockSize = 1 )
198- blocks1.foreach(agg1.add)
199- val loss1 = agg1.loss
200- val grad1 = agg1.gradient
201- for (blocks <- Seq (blocks1, blocks2, blocks3); blockSize <- Seq (1 , 2 , 4 )) {
202- val agg = getNewBlockAggregator(instances, Vectors .dense(coefArray ++ Array (intercept)),
203- fitIntercept = true , blockSize = blockSize)
204- blocks.foreach(agg.add)
205- assert(loss1 ~== agg.loss relTol 1e-9 )
206- assert(grad1 ~== agg.gradient relTol 1e-9 )
207- }
208-
// Same comparison without an intercept: coefficients are coefArray only.
209- val agg2 = getNewBlockAggregator(instances, Vectors .dense(coefArray),
210- fitIntercept = false , blockSize = 1 )
211- blocks1.foreach(agg2.add)
212- val loss2 = agg2.loss
213- val grad2 = agg2.gradient
214- for (blocks <- Seq (blocks1, blocks2, blocks3); blockSize <- Seq (1 , 2 , 4 )) {
215- val agg = getNewBlockAggregator(instances, Vectors .dense(coefArray),
216- fitIntercept = false , blockSize = blockSize)
217- blocks.foreach(agg.add)
218- assert(loss2 ~== agg.loss relTol 1e-9 )
219- assert(grad2 ~== agg.gradient relTol 1e-9 )
220- }
221- }
222204}
0 commit comments