[SPARK-3790][MLlib] CosineSimilarity Example

rezazadeh · mengxr · commit 3d7b36e0de26 · 2014-10-07T16:40:16.000-07:00
Provide example for `RowMatrix.columnSimilarity()` Author: Reza Zadeh <rizlar@gmail.com> Closes apache#2622 from rezazadeh/dimsumexample and squashes the following commits: 8f20b82 [Reza Zadeh] update comment 379066d [Reza Zadeh] cache rows 792b81c [Reza Zadeh] Address review comments e573c7a [Reza Zadeh] Average absolute error b15685f [Reza Zadeh] Use scopt. Distribute evaluation. eca3dfd [Reza Zadeh] Documentation ac96fb2 [Reza Zadeh] Compute approximation error, add command line. 4533579 [Reza Zadeh] CosineSimilarity Example
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+import scopt.OptionParser
+
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}
+import org.apache.spark.{SparkConf, SparkContext}
+
+/**
+ * Compute the similar columns of a matrix, using cosine similarity.
+ *
+ * The input matrix must be stored in row-oriented dense format, one line per row with its entries
+ * separated by space. For example,
+ * {{{
+ * 0.5 1.0
+ * 2.0 3.0
+ * 4.0 5.0
+ * }}}
+ * represents a 3-by-2 matrix, whose first row is (0.5, 1.0).
+ *
+ * Example invocation:
+ * {{{
+ * bin/run-example mllib.CosineSimilarity \
+ *   --threshold 0.1 data/mllib/sample_svm_data.txt
+ * }}}
+ */
+object CosineSimilarity {
+  /**
+   * Command-line parameters of the example.
+   *
+   * @param inputFile path of the text file holding the matrix, one row per line
+   * @param threshold similarity threshold passed to `RowMatrix.columnSimilarities(threshold)`
+   */
+  case class Params(inputFile: String = null, threshold: Double = 0.1)
+
+  /**
+   * Parses the command line and runs the example. Exits with status 1 when the parser does not
+   * produce a [[Params]] instance.
+   *
+   * @param args raw command-line arguments
+   */
+  def main(args: Array[String]): Unit = {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("CosineSimilarity") {
+      head("CosineSimilarity: an example app.")
+      opt[Double]("threshold")
+        .required()
+        .text("threshold similarity: to tradeoff computation vs quality estimate")
+        .action((x, c) => c.copy(threshold = x))
+      arg[String]("<inputFile>")
+        .required()
+        .text("input file, one row per line, space-separated")
+        .action((x, c) => c.copy(inputFile = x))
+      note(
+        """
+          |For example, the following command runs this app on a dataset:
+          |
+          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
+          | examplesjar.jar \
+          | --threshold 0.1 data/mllib/sample_svm_data.txt
+        """.stripMargin)
+    }
+
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case None => System.exit(1)
+    }
+  }
+
+  /**
+   * Loads the matrix from `params.inputFile`, computes the column similarities both exactly and
+   * with the thresholded estimate, and prints the average absolute error of the estimate.
+   *
+   * @param params parsed command-line parameters
+   */
+  def run(params: Params): Unit = {
+    val conf = new SparkConf().setAppName("CosineSimilarity")
+    val sc = new SparkContext(conf)
+
+    // Stop the context even when the job fails, so a failed run does not leave it behind.
+    try {
+      // Load and parse the data file. Trimming and splitting on runs of whitespace tolerates
+      // leading or repeated separators, which split(' ') would turn into unparseable "" tokens.
+      val rows = sc.textFile(params.inputFile).map { line =>
+        Vectors.dense(line.trim.split("\\s+").map(_.toDouble))
+      }.cache()
+      val mat = new RowMatrix(rows)
+
+      // Compute similar columns perfectly, with brute force.
+      val exact = mat.columnSimilarities()
+
+      // Compute similar columns with estimation using DIMSUM
+      val approx = mat.columnSimilarities(params.threshold)
+
+      val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) }
+      val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) }
+      // An exact entry with no counterpart in the estimate is scored as an estimate of 0.
+      val meanAbsoluteError = exactEntries.leftOuterJoin(approxEntries).values.map {
+        case (u, estimate) => math.abs(u - estimate.getOrElse(0.0))
+      }.mean()
+
+      println(s"Average absolute error in estimate is: $meanAbsoluteError")
+    } finally {
+      sc.stop()
+    }
+  }
+}