From 1b1a0c6c92fc94bc31b8060e1b1f834db20c9d21 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 4 Dec 2015 09:36:47 -0500 Subject: [PATCH 1/3] avoid view --- .../org/apache/spark/mllib/stat/test/ChiSqTest.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index 23c8d7c7c8075..63ba1ee7ee975 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -109,9 +109,10 @@ private[stat] object ChiSqTest extends Logging { } i += 1 distinctLabels += label - features.toArray.view.zipWithIndex.slice(startCol, endCol).map { case (feature, col) => - allDistinctFeatures(col) += feature - (col, feature, label) + features.toArray.slice(startCol, endCol).zip(startCol until endCol).map { + case (feature, col) => + allDistinctFeatures(col) += feature + (col, feature, label) } } }.countByValue() @@ -122,7 +123,7 @@ private[stat] object ChiSqTest extends Logging { pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap } val numLabels = labels.size - pairCounts.keys.groupBy(_._1).map { case (col, keys) => + pairCounts.keys.groupBy(_._1).foreach { case (col, keys) => val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap val numRows = features.size val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels)) From 8d8327d7f828ed865a103509f361292a69402129 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 7 Dec 2015 17:22:26 +0800 Subject: [PATCH 2/3] optimize loop --- .../org/apache/spark/mllib/stat/test/ChiSqTest.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index 63ba1ee7ee975..57bf32e927970 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -109,10 +109,11 @@ private[stat] object ChiSqTest extends Logging { } i += 1 distinctLabels += label - features.toArray.slice(startCol, endCol).zip(startCol until endCol).map { - case (feature, col) => - allDistinctFeatures(col) += feature - (col, feature, label) + val featureArray = features.toArray + (startCol until endCol).map { col => + val feature = featureArray(col) + allDistinctFeatures(col) += feature + (col, feature, label) } } }.countByValue() From a709f49a751aa07ccbefdd6a44a5c1afa4b57f35 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 12 Jan 2016 10:04:19 +0800 Subject: [PATCH 3/3] use to breeze for features --- .../scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index 907c88a17f5e1..4a3fb06469818 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -109,9 +109,9 @@ private[stat] object ChiSqTest extends Logging { } i += 1 distinctLabels += label - val featureArray = features.toArray + val brzFeatures = features.toBreeze (startCol until endCol).map { col => - val feature = featureArray(col) + val feature = brzFeatures(col) allDistinctFeatures(col) += feature (col, feature, label) }