Compute approximation error; add command-line arguments.

rezazadeh · rezazadeh · commit ac96fb263830 · 2014-10-03T23:34:56.000-07:00
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
@@ -17,46 +17,66 @@
 
 package org.apache.spark.examples.mllib
 
+import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, CoordinateMatrix, RowMatrix}
 
 /**
  * Compute the similar columns of a matrix, using cosine similarity.
+ *
+ * The input matrix must be stored in row-oriented dense format, one line per row with its entries
+ * separated by a single space. For example,
+ * {{{
+ * 0.5 1.0
+ * 2.0 3.0
+ * 4.0 5.0
+ * }}}
+ * represents a 3-by-2 matrix, whose first row is (0.5, 1.0).
+ *
+ * Example invocation:
+ *
+ * bin/run-example org.apache.spark.examples.mllib.CosineSimilarity \
+ * data/mllib/sample_svm_data.txt 0.1
  */
 object CosineSimilarity {
   def main(args: Array[String]) {
+    if (args.length != 2) {
+      System.err.println("Usage: CosineSimilarity <input> <threshold>")
+      System.exit(1)
+    }
+
     val conf = new SparkConf().setAppName("CosineSimilarity")
     val sc = new SparkContext(conf)
 
-    // Number of rows
-    val M = 1000
-    // Number of columns
-    val U = 1000
-    // Number of nonzeros per row
-    val NNZ = 10
-    // Number of partitions for data
-    val NUMCHUNKS = 4
-
-    // Create data
-    val R = sc.parallelize(0 until M, NUMCHUNKS).flatMap{i =>
-      val inds = new scala.collection.mutable.TreeSet[Int]()
-      while (inds.size < NNZ) {
-        inds += scala.util.Random.nextInt(U)
-      }
-      inds.toArray.map(j => MatrixEntry(i, j, scala.math.random))
+    // Load and parse the data file.
+    val rows = sc.textFile(args(0)).map { line =>
+      val values = line.split(' ').map(_.toDouble)
+      Vectors.dense(values)
     }
+    val mat = new RowMatrix(rows)
 
-    val mat = new CoordinateMatrix(R, M, U).toRowMatrix()
+    val threshold = args(1).toDouble
 
     // Compute similar columns perfectly, with brute force.
-    val simsPerfect = mat.columnSimilarities()
+    val simsPerfect = mat.columnSimilarities().entries.collect
+
+    // Compute similar columns approximately, focusing on pairs more similar than the threshold.
+    val simsEstimate = mat.columnSimilarities(threshold).entries.collect
 
-    println("Pairwise similarities are: " + simsPerfect.entries.collect.mkString(", "))
+    val n = mat.numCols().toInt
+    val real = Array.ofDim[Double](n, n)
+    val est = Array.ofDim[Double](n, n)
+    for (entry <- simsPerfect) {
+      real(entry.i.toInt)(entry.j.toInt) = entry.value
+    }
+    for (entry <- simsEstimate) {
+      est(entry.i.toInt)(entry.j.toInt) = entry.value
+    }
 
-    // Compute similar columns with estimation focusing on pairs more similar than 0.8
-    val simsEstimate = mat.columnSimilarities(0.8)
+    val errors = Array.tabulate[Double](n, n)((i, j) => math.abs(real(i)(j) - est(i)(j)))
+    val avgErr = errors.flatten.sum / (n * (n - 1) / 2)
 
-    println("Estimated pairwise similarities are: " + simsEstimate.entries.collect.mkString(", "))
+    println(s"Average error in estimate is: $avgErr")
 
     sc.stop()
   }