From c0cc64380e906f08a0f8abbfd5c2ccd3c0333bd5 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Mon, 22 Sep 2014 16:53:56 -0400
Subject: [PATCH 01/15] Add minimumOccurence filtering to IDF

---
 docs/mllib-feature-extraction.md              | 15 ++++++++
 .../org/apache/spark/mllib/feature/IDF.scala  | 34 ++++++++++++++++---
 .../spark/mllib/feature/JavaTfIdfSuite.java   | 22 ++++++++++++
 .../apache/spark/mllib/feature/IDFSuite.scala | 34 +++++++++++++++++++
 4 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 41a27f6208d1b..6e22d3f4cfb92 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -82,6 +82,21 @@ tf.cache()
 val idf = new IDF().fit(tf)
 val tfidf: RDD[Vector] = idf.transform(tf)
 {% endhighlight %}
+
+MLLib's IDF implementation provides an option for ignoring terms which occur in less than a
+minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
+can be used by passing the `minimumOccurence` value to the IDF constructor.
+
+{% highlight scala %}
+import org.apache.spark.mllib.feature.IDF
+
+// ... continue from the previous example
+tf.cache()
+val idf = new IDF(minimumOccurence=2).fit(tf)
+val tfidf: RDD[Vector] = idf.transform(tf)
+{% endhighlight %}
+
+
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index d40d5553c1d21..1772ab914b6cb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -30,9 +30,20 @@ import org.apache.spark.rdd.RDD
  * Inverse document frequency (IDF).
  * The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`, where `m` is the total
  * number of documents and `d(t)` is the number of documents that contain term `t`.
+ *
+ * This implementation supports filtering out terms which do not appear in a minimum number
+ * of documents (controlled by the variable minimumOccurence). For terms that are not in
+ * at least `minimumOccurence` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+ *
+ * @param minimumOccurence minimum of documents in which a term
+ *                         should appear for filtering
+ *
+ *
  */
 @Experimental
-class IDF {
+class IDF(minimumOccurence: Long) {
+
+  def this() = this(0L)
 
   // TODO: Allow different IDF formulations.
 
@@ -41,7 +52,7 @@ class IDF {
    * @param dataset an RDD of term frequency vectors
    */
   def fit(dataset: RDD[Vector]): IDFModel = {
-    val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator)(
+    val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(minimumOccurence=minimumOccurence))(
       seqOp = (df, v) => df.add(v),
       combOp = (df1, df2) => df1.merge(df2)
     ).idf()
@@ -60,7 +71,7 @@ class IDF {
 private object IDF {
 
   /** Document frequency aggregator. */
-  class DocumentFrequencyAggregator extends Serializable {
+  class DocumentFrequencyAggregator(minimumOccurence: Long) extends Serializable {
 
     /** number of documents */
     private var m = 0L
@@ -123,7 +134,17 @@ private object IDF {
       val inv = new Array[Double](n)
       var j = 0
       while (j < n) {
-        inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
+        /*
+         * If the term is not present in the minimum
+         * number of documents, set IDF to 0. This
+         * will cause multiplication in IDFModel to
+         * set TF-IDF to 0.
+         */
+        if(df(j) >= minimumOccurence) {
+          inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
+        } else {
+          inv(j) = 0.0
+        }
         j += 1
       }
       Vectors.dense(inv)
@@ -140,6 +161,11 @@ class IDFModel private[mllib] (val idf: Vector) extends Serializable {
 
   /**
    * Transforms term frequency (TF) vectors to TF-IDF vectors.
+   *
+   * If minimumOccurence was set for the IDF calculation,
+   * the terms which occur in fewer than minimumOccurence
+   * documents will have an entry of 0.
+   *
    * @param dataset an RDD of term frequency vectors
    * @return an RDD of TF-IDF vectors
    */
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
index e8d99f4ae43ae..4dd116d0e2c6c 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
@@ -27,6 +27,8 @@ import org.junit.Test;
 
 import com.google.common.collect.Lists;
 
+import java.lang.reflect.Method;
+
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.mllib.linalg.Vector;
@@ -63,4 +65,24 @@ public void tfIdf() {
       Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
     }
   }
+
+  @Test
+  public void tfIdfMinimumOccurence() {
+    // The tests are to check Java compatibility.
+    HashingTF tf = new HashingTF();
+    JavaRDD<List<String>> documents = sc.parallelize(Lists.newArrayList(
+      Lists.newArrayList("this is a sentence".split(" ")),
+      Lists.newArrayList("this is another sentence".split(" ")),
+      Lists.newArrayList("this is still a sentence".split(" "))), 2);
+    JavaRDD<Vector> termFreqs = tf.transform(documents);
+    termFreqs.collect();
+    IDF idf = new IDF(2);
+    JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
+    List<Vector> localTfIdfs = tfIdfs.collect();
+    int indexOfThis = tf.indexOf("this");
+    for (Vector v: localTfIdfs) {
+      Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
+    }
+  }
+
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index 53d9c0c640b98..34bd438dbcab6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -54,4 +54,38 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     assert(tfidf2.indices === Array(1))
     assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
   }
+
+  test("idf minimum occurence filtering") {
+    val n = 4
+    val localTermFrequencies = Seq(
+      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
+      Vectors.dense(0.0, 1.0, 2.0, 3.0),
+      Vectors.sparse(n, Array(1), Array(1.0))
+    )
+    val m = localTermFrequencies.size
+    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
+    val idf = new IDF(minimumOccurence=1L)
+    val model = idf.fit(termFrequencies)
+    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
+      if(x > 0) {
+        math.log((m.toDouble + 1.0) / (x + 1.0))
+      } else {
+        0
+      }
+    })
+    assert(model.idf ~== expected absTol 1e-12)
+    val tfidf = model.transform(termFrequencies).cache().zipWithIndex().map(_.swap).collectAsMap()
+    assert(tfidf.size === 3)
+    val tfidf0 = tfidf(0L).asInstanceOf[SparseVector]
+    assert(tfidf0.indices === Array(1, 3))
+    assert(Vectors.dense(tfidf0.values) ~==
+      Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
+    val tfidf1 = tfidf(1L).asInstanceOf[DenseVector]
+    assert(Vectors.dense(tfidf1.values) ~==
+      Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
+    val tfidf2 = tfidf(2L).asInstanceOf[SparseVector]
+    assert(tfidf2.indices === Array(1))
+    assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
+  }
+
 }
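
[Editor's note: the filtering rule this first patch introduces is easy to check by hand. Below is a standalone sketch, not part of the patch series; the object and method names are illustrative. It applies the same formula, `idf = log((m + 1) / (d(t) + 1))`, and zeroes out terms below the cutoff.]

{% highlight scala %}
object IdfFilteringSketch {
  // m: total number of documents; df(t): number of documents containing term t.
  // Terms with df(t) < minimumOccurence get an IDF of 0, so their TF-IDFs are 0 too.
  def idf(m: Long, df: Array[Long], minimumOccurence: Long): Array[Double] =
    df.map { d =>
      if (d >= minimumOccurence) math.log((m + 1.0) / (d + 1.0)) else 0.0
    }

  def main(args: Array[String]): Unit = {
    // Four terms over m = 4 documents; terms 0 and 1 fall below a cutoff of 2.
    val result = idf(m = 4L, df = Array(0L, 1L, 2L, 3L), minimumOccurence = 2L)
    println(result.mkString(", "))  // 0.0, 0.0, ~0.51, ~0.22
  }
}
{% endhighlight %}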
From 4b974f5dcbaa32560f6f5167f154290d68d86f5e Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Mon, 22 Sep 2014 17:12:50 -0400
Subject: [PATCH 02/15] Remove accidentally-added import from testing

---
 .../java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
index 4dd116d0e2c6c..3d9a3a01e045a 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
@@ -27,8 +27,6 @@ import org.junit.Test;
 
 import com.google.common.collect.Lists;
 
-import java.lang.reflect.Method;
-
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.mllib.linalg.Vector;

From a200babbad7280d3a20f05abb84140b0b8d51b85 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Mon, 22 Sep 2014 19:47:11 -0400
Subject: [PATCH 03/15] Remove unnecessary else statement

---
 .../src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 1772ab914b6cb..671eddc54aa2a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -139,11 +139,12 @@ private object IDF {
          * number of documents, set IDF to 0. This
          * will cause multiplication in IDFModel to
          * set TF-IDF to 0.
+         *
+         * Since arrays are initialized to 0 by default,
+         * we just omit changing those entries.
          */
         if(df(j) >= minimumOccurence) {
           inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
-        } else {
-          inv(j) = 0.0
         }
         j += 1
       }
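
[Editor's note: the rationale for dropping the `else` branch can be verified in a REPL. A JVM numeric array is zero-filled on allocation, so entries the filter skips are already 0.0. A quick illustrative check, not part of the patch:]

{% highlight scala %}
val inv = new Array[Double](4)
println(inv.mkString(", "))  // prints 0.0, 0.0, 0.0, 0.0 before any assignment
{% endhighlight %}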
From 689725201b3fbfa1232f4b5f74dc5002c8950b3f Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Mon, 22 Sep 2014 20:31:15 -0400
Subject: [PATCH 04/15] Preface minimumOccurence members with val to make them
 final and immutable

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 671eddc54aa2a..d0969f9309fed 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -41,7 +41,7 @@ import org.apache.spark.rdd.RDD
  *
  */
 @Experimental
-class IDF(minimumOccurence: Long) {
+class IDF(val minimumOccurence: Long) {
 
   def this() = this(0L)
 
   // TODO: Allow different IDF formulations.
@@ -71,7 +71,7 @@ private object IDF {
 
   /** Document frequency aggregator. */
-  class DocumentFrequencyAggregator(minimumOccurence: Long) extends Serializable {
+  class DocumentFrequencyAggregator(val minimumOccurence: Long) extends Serializable {
 
     /** number of documents */
     private var m = 0L

From 1801fd2e9518c610b3657c6a9cb9239fedd43847 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Mon, 22 Sep 2014 20:36:10 -0400
Subject: [PATCH 05/15] Fix style errors in IDF.scala

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index d0969f9309fed..0beaf0471e1f4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -52,7 +52,8 @@ class IDF(val minimumOccurence: Long) {
    * @param dataset an RDD of term frequency vectors
    */
   def fit(dataset: RDD[Vector]): IDFModel = {
-    val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(minimumOccurence=minimumOccurence))(
+    val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
+      minimumOccurence=minimumOccurence))(
       seqOp = (df, v) => df.add(v),
       combOp = (df1, df2) => df1.merge(df2)
     ).idf()

From 9fb40937bfcdca4a00f075892407b23fc7e4c95c Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 08:42:39 -0400
Subject: [PATCH 06/15] Add backwards-compatible constructor to
 DocumentFrequencyAggregator

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 0beaf0471e1f4..0712456809825 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -79,6 +79,9 @@ private object IDF {
     /** document frequency vector */
     private var df: BDV[Long] = _
 
+
+  def this() = this(0L);
+
     /** Adds a new document. */
     def add(doc: Vector): this.type = {
       if (isEmpty) {

From 1fc09d83a911f22b0454ede9ac28ceec0129598c Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:00:59 -0400
Subject: [PATCH 07/15] Remove unnecessary lines from IDF class docs

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 0712456809825..8daccc8b1e096 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -37,8 +37,6 @@ import org.apache.spark.rdd.RDD
  *
  * @param minimumOccurence minimum of documents in which a term
  *                         should appear for filtering
- *
- *
  */
 @Experimental
 class IDF(val minimumOccurence: Long) {
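
[Editor's note: patches 04 and 06 together follow a common Scala pattern for evolving an API. The new setting becomes an immutable `val` constructor parameter, and a zero-argument auxiliary constructor keeps old call sites compiling. A minimal sketch of that pattern, using a hypothetical `Example` class rather than the real `IDF`:]

{% highlight scala %}
class Example(val minimumOccurence: Long) {
  // Auxiliary constructor: keeps the old no-argument API compiling.
  def this() = this(0L)
}

val before = new Example          // minimumOccurence == 0, the old behaviour
val filtered = new Example(2L)    // opt in to the new filtering
println(s"${before.minimumOccurence} ${filtered.minimumOccurence}")  // 0 2
{% endhighlight %}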
From 47850ab413a0829685ac72f18b06da2962e0fdd5 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:05:40 -0400
Subject: [PATCH 08/15] Changed minimumOccurence to Int from Long

---
 .../main/scala/org/apache/spark/mllib/feature/IDF.scala  | 8 ++++----
 .../scala/org/apache/spark/mllib/feature/IDFSuite.scala  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 8daccc8b1e096..07a041bfd9d59 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -39,9 +39,9 @@ import org.apache.spark.rdd.RDD
  *                         should appear for filtering
  */
 @Experimental
-class IDF(val minimumOccurence: Long) {
+class IDF(val minimumOccurence: Int) {
 
-  def this() = this(0L)
+  def this() = this(0)
 
   // TODO: Allow different IDF formulations.
 
@@ -70,7 +70,7 @@ class IDF(val minimumOccurence: Int) {
 private object IDF {
 
   /** Document frequency aggregator. */
-  class DocumentFrequencyAggregator(val minimumOccurence: Long) extends Serializable {
+  class DocumentFrequencyAggregator(val minimumOccurence: Int) extends Serializable {
 
     /** number of documents */
     private var m = 0L
@@ -78,7 +78,7 @@ private object IDF {
     private var df: BDV[Long] = _
 
 
-  def this() = this(0L);
+  def this() = this(0);
 
     /** Adds a new document. */
     def add(doc: Vector): this.type = {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index 34bd438dbcab6..38b0b137cea3e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -64,7 +64,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val m = localTermFrequencies.size
     val termFrequencies = sc.parallelize(localTermFrequencies, 2)
-    val idf = new IDF(minimumOccurence=1L)
+    val idf = new IDF(minimumOccurence=1)
     val model = idf.fit(termFrequencies)

From 40fd70cc4ca39c999b2da4188aafd7fbcc2b17ea Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:12:07 -0400
Subject: [PATCH 09/15] Change minimumOccurence to minDocFreq in code and docs

---
 docs/mllib-feature-extraction.md              |  4 ++--
 .../org/apache/spark/mllib/feature/IDF.scala  | 20 +++++++++----------
 .../spark/mllib/feature/JavaTfIdfSuite.java   |  2 +-
 .../apache/spark/mllib/feature/IDFSuite.scala |  4 ++--
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 6e22d3f4cfb92..e8904437edb00 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -85,14 +85,14 @@ val tfidf: RDD[Vector] = idf.transform(tf)
 
 MLLib's IDF implementation provides an option for ignoring terms which occur in less than a
 minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
-can be used by passing the `minimumOccurence` value to the IDF constructor.
+can be used by passing the `minDocFreq` value to the IDF constructor.
 
 {% highlight scala %}
 import org.apache.spark.mllib.feature.IDF
 
 // ... continue from the previous example
 tf.cache()
-val idf = new IDF(minimumOccurence=2).fit(tf)
+val idf = new IDF(minDocFreq=2).fit(tf)
 val tfidf: RDD[Vector] = idf.transform(tf)
 {% endhighlight %}
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 07a041bfd9d59..0eed4b8809dce 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -32,14 +32,14 @@ import org.apache.spark.rdd.RDD
  * number of documents and `d(t)` is the number of documents that contain term `t`.
  *
  * This implementation supports filtering out terms which do not appear in a minimum number
- * of documents (controlled by the variable minimumOccurence). For terms that are not in
- * at least `minimumOccurence` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+ * of documents (controlled by the variable `minDocFreq`). For terms that are not in
+ * at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
 *
- * @param minimumOccurence minimum of documents in which a term
- *                         should appear for filtering
+ * @param minDocFreq minimum of documents in which a term
+ *                   should appear for filtering
  */
 @Experimental
-class IDF(val minimumOccurence: Int) {
+class IDF(val minDocFreq: Int) {
 
   def this() = this(0)
 
@@ -51,7 +51,7 @@ class IDF(val minDocFreq: Int) {
    */
   def fit(dataset: RDD[Vector]): IDFModel = {
     val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
-      minimumOccurence=minimumOccurence))(
+      minDocFreq=minDocFreq))(
       seqOp = (df, v) => df.add(v),
       combOp = (df1, df2) => df1.merge(df2)
     ).idf()
@@ -70,7 +70,7 @@ private object IDF {
 
   /** Document frequency aggregator. */
-  class DocumentFrequencyAggregator(val minimumOccurence: Int) extends Serializable {
+  class DocumentFrequencyAggregator(val minDocFreq: Int) extends Serializable {
 
     /** number of documents */
     private var m = 0L
@@ -145,7 +145,7 @@ private object IDF {
          * Since arrays are initialized to 0 by default,
          * we just omit changing those entries.
          */
-        if(df(j) >= minimumOccurence) {
+        if(df(j) >= minDocFreq) {
           inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
         }
         j += 1
@@ -165,8 +165,8 @@ class IDFModel private[mllib] (val idf: Vector) extends Serializable {
   /**
    * Transforms term frequency (TF) vectors to TF-IDF vectors.
    *
-   * If minimumOccurence was set for the IDF calculation,
-   * the terms which occur in fewer than minimumOccurence
+   * If `minDocFreq` was set for the IDF calculation,
+   * the terms which occur in fewer than `minDocFreq`
    * documents will have an entry of 0.
    *
    * @param dataset an RDD of term frequency vectors
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
index 3d9a3a01e045a..064263e02cd11 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
@@ -65,7 +65,7 @@ public void tfIdf() {
   }
 
   @Test
-  public void tfIdfMinimumOccurence() {
+  public void tfIdfMinimumDocumentFrequency() {
     // The tests are to check Java compatibility.
     HashingTF tf = new HashingTF();
     JavaRDD<List<String>> documents = sc.parallelize(Lists.newArrayList(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index 38b0b137cea3e..c53fb4c587280 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -55,7 +55,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
   }
 
-  test("idf minimum occurence filtering") {
+  test("idf minimum document frequency filtering") {
     val n = 4
     val localTermFrequencies = Seq(
       Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
@@ -64,7 +64,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val m = localTermFrequencies.size
     val termFrequencies = sc.parallelize(localTermFrequencies, 2)
-    val idf = new IDF(minimumOccurence=1)
+    val idf = new IDF(minDocFreq=1)
     val model = idf.fit(termFrequencies)
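
[Editor's note: with the rename in place, end-to-end usage reads as below. This is a sketch assuming an existing SparkContext named `sc` and the patched MLlib on the classpath; the corpus mirrors the one in the Java test, where "another" and "still" each appear in only one document and are therefore zeroed out at `minDocFreq = 2`.]

{% highlight scala %}
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

val documents: RDD[Seq[String]] = sc.parallelize(Seq(
  "this is a sentence".split(" ").toSeq,
  "this is another sentence".split(" ").toSeq,
  "this is still a sentence".split(" ").toSeq))
val tf: RDD[Vector] = new HashingTF().transform(documents)
tf.cache()
// Terms in fewer than 2 documents ("another", "still") get TF-IDF 0.
val tfidf: RDD[Vector] = new IDF(minDocFreq = 2).fit(tf).transform(tf)
{% endhighlight %}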
From 79978fc05860d9ee3f9c1e401737e35f33a243c2 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:12:47 -0400
Subject: [PATCH 10/15] Remove unnecessary semi-colon

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 0eed4b8809dce..6b206e24eff42 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -78,7 +78,7 @@ private object IDF {
     private var df: BDV[Long] = _
 
 
-  def this() = this(0);
+  def this() = this(0)
 
     /** Adds a new document. */
     def add(doc: Vector): this.type = {

From 901344703749a6370b322df626802ff4b00d280b Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:13:56 -0400
Subject: [PATCH 11/15] Add space before division operator

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 6b206e24eff42..b7543ea4c02ed 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -146,7 +146,7 @@ private object IDF {
          * we just omit changing those entries.
          */
         if(df(j) >= minDocFreq) {
-          inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
+          inv(j) = math.log((m + 1.0) / (df(j) + 1.0))
         }
         j += 1
       }

From 30d20b39757062b4a5fb6a89e7291407fee9b11f Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:18:25 -0400
Subject: [PATCH 12/15] Add spaces around equals signs

---
 docs/mllib-feature-extraction.md                              | 2 +-
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 2 +-
 .../test/scala/org/apache/spark/mllib/feature/IDFSuite.scala  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index e8904437edb00..1511ae6dda4ed 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -92,7 +92,7 @@ import org.apache.spark.mllib.feature.IDF
 
 // ... continue from the previous example
 tf.cache()
-val idf = new IDF(minDocFreq=2).fit(tf)
+val idf = new IDF(minDocFreq = 2).fit(tf)
 val tfidf: RDD[Vector] = idf.transform(tf)
 {% endhighlight %}
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index b7543ea4c02ed..632b8acc03bf1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -51,7 +51,7 @@ class IDF(val minDocFreq: Int) {
    */
   def fit(dataset: RDD[Vector]): IDFModel = {
     val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
-      minDocFreq=minDocFreq))(
+      minDocFreq = minDocFreq))(
       seqOp = (df, v) => df.add(v),
       combOp = (df1, df2) => df1.merge(df2)
     ).idf()
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index c53fb4c587280..b325670c1b9df 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -64,7 +64,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val m = localTermFrequencies.size
     val termFrequencies = sc.parallelize(localTermFrequencies, 2)
-    val idf = new IDF(minDocFreq=1)
+    val idf = new IDF(minDocFreq = 1)
     val model = idf.fit(termFrequencies)

From bfa82ece63c4320255efb1ef717e6a69e4e598e2 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:19:32 -0400
Subject: [PATCH 13/15] Add space after if

---
 .../test/scala/org/apache/spark/mllib/feature/IDFSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index b325670c1b9df..6f00e6f9f0ef5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -67,7 +67,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val idf = new IDF(minDocFreq = 1)
     val model = idf.fit(termFrequencies)
     val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
-      if(x > 0) {
+      if (x > 0) {
         math.log((m.toDouble + 1.0) / (x + 1.0))
       } else {
         0
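
[Editor's note: before the final cleanup it is worth spelling out what the suite asserts. For the three test documents the per-term document frequencies are (0, 3, 1, 2) and m = 3, so with `minDocFreq = 1` only the never-seen term is zeroed. The expected values can be reproduced with plain Scala arithmetic, no Spark needed, which also shows why the `toDouble` the next patch removes is redundant: `m + 1.0` already promotes the Int to Double.]

{% highlight scala %}
val m = 3
val expected = Array(0, 3, 1, 2).map { x =>
  if (x > 0) math.log((m + 1.0) / (x + 1.0)) else 0.0
}
// approx. 0.0, 0.0 (log(4/4)), 0.6931 (log(4/2)), 0.2877 (log(4/3))
println(expected.mkString(", "))
{% endhighlight %}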
From e6523a829d1e402639282642ca72833e403b84b2 Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 13:20:47 -0400
Subject: [PATCH 14/15] Remove unnecessary toDouble's from IDFSuite

---
 .../test/scala/org/apache/spark/mllib/feature/IDFSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index 6f00e6f9f0ef5..43974f84e3ca8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -38,7 +38,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val idf = new IDF
     val model = idf.fit(termFrequencies)
     val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
-      math.log((m.toDouble + 1.0) / (x + 1.0))
+      math.log((m + 1.0) / (x + 1.0))
     })
     assert(model.idf ~== expected absTol 1e-12)
     val tfidf = model.transform(termFrequencies).cache().zipWithIndex().map(_.swap).collectAsMap()
@@ -68,7 +68,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val model = idf.fit(termFrequencies)
     val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
       if (x > 0) {
-        math.log((m.toDouble + 1.0) / (x + 1.0))
+        math.log((m + 1.0) / (x + 1.0))
       } else {
         0
       }

From 0aa3c63b699f60d3c08ba8bf739a29e8a35d999a Mon Sep 17 00:00:00 2001
From: RJ Nowling
Date: Tue, 23 Sep 2014 15:06:38 -0400
Subject: [PATCH 15/15] Fix indentation

---
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 632b8acc03bf1..720bb70b08dbf 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -78,7 +78,7 @@ private object IDF {
     private var df: BDV[Long] = _
 
 
-  def this() = this(0)
+    def this() = this(0)
 
     /** Adds a new document. */
     def add(doc: Vector): this.type = {
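
[Editor's note: for reference, the document-frequency aggregation that `fit` runs through `treeAggregate` can be sketched without Spark. A per-partition fold plays the role of `seqOp` (the aggregator's `add`) and a pairwise sum plays `combOp` (its `merge`). The arrays below are a plain stand-in for the breeze vectors the real aggregator uses; the corpus matches the suite's, and the resulting document frequencies (0, 3, 1, 2) are exactly the ones the tests assert against.]

{% highlight scala %}
// seqOp analogue: count a term's document once if it appears in the document.
def add(df: Array[Long], doc: Array[Double]): Array[Long] =
  df.zip(doc).map { case (d, tf) => if (tf > 0) d + 1 else d }

// combOp analogue: element-wise sum of two partial document-frequency vectors.
def merge(df1: Array[Long], df2: Array[Long]): Array[Long] =
  df1.zip(df2).map { case (a, b) => a + b }

// The suite's three term-frequency vectors, densified.
val docs = Seq(
  Array(0.0, 1.0, 0.0, 2.0),
  Array(0.0, 1.0, 2.0, 3.0),
  Array(0.0, 1.0, 0.0, 0.0))

// Two "partitions", folded independently and then merged.
val (p1, p2) = docs.splitAt(2)
def zero = Array.fill[Long](4)(0L)
val df = merge(p1.foldLeft(zero)(add), p2.foldLeft(zero)(add))
println(df.mkString(", "))  // 0, 3, 1, 2
{% endhighlight %}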