Locality Sensitive Hashing (LSH) Python API.

yanboliang · yanboliang · commit 85d22c37d3fe · 2016-11-04T07:22:23.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala
@@ -39,9 +39,9 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol {
    * higher the dimension is, the lower the false negative rate.
    * @group param
    */
-  final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension, where" +
-    "increasing dimensionality lowers the false negative rate, and decreasing dimensionality" +
-    " improves the running performance", ParamValidators.gt(0))
+  final val outputDim: IntParam = new IntParam(this, "outputDim", "The output dimension, where" +
+    " increasing dimensionality lowers the false negative rate, and decreasing dimensionality" +
+    " improves the running performance.", ParamValidators.gt(0))
 
   /** @group getParam */
   final def getOutputDim: Int = $(outputDim)
@@ -109,11 +109,11 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
   *  - Single Probing: Fast, return at most k elements (Probing only one bucket)
   *  - Multiple Probing: Slow, return exactly k elements (Probing multiple buckets close to the key)
    *
-   * @param dataset the dataset to search for nearest neighbors of the key
-   * @param key Feature vector representing the item to search for
-   * @param numNearestNeighbors The maximum number of nearest neighbors
-   * @param singleProbing True for using Single Probing; false for multiple probing
-   * @param distCol Output column for storing the distance between each result row and the key
+   * @param dataset The dataset to search for nearest neighbors of the key.
+   * @param key Feature vector representing the item to search for.
+   * @param numNearestNeighbors The maximum number of nearest neighbors.
+   * @param singleProbing True for using Single Probing; false for multiple probing.
+   * @param distCol Output column for storing the distance between each result row and the key.
    * @return A dataset containing at most k items closest to the key. A distCol is added to show
    *         the distance between each row and the key.
    */
@@ -215,12 +215,12 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    * [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the transformed
    * data when necessary.
    *
-   * @param datasetA One of the datasets to join
-   * @param datasetB Another dataset to join
-   * @param threshold The threshold for the distance of row pairs
-   * @param distCol Output column for storing the distance between each result row and the key
+   * @param datasetA One of the datasets to join.
+   * @param datasetB Another dataset to join.
+   * @param threshold The threshold for the distance of row pairs.
+   * @param distCol Output column for storing the distance between each pair of rows.
    * @return A joined dataset containing pairs of rows. The original rows are in columns
-   *         "datasetA" and "datasetB", and a distCol is added to show the distance of each pair
+   *         "datasetA" and "datasetB", and a distCol is added to show the distance of each pair.
    */
   def approxSimilarityJoin(
       datasetA: Dataset[_],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala
@@ -33,8 +33,8 @@ import org.apache.spark.sql.types.StructType
  *
  * Model produced by [[MinHash]], where multiple hash functions are stored. Each hash function is
  * a perfect hash function:
- *    `h_i(x) = (x * k_i mod prime) mod numEntries`
- * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_prime^*`
+ *    `h_i(x) = (x * k_i \mod prime) \mod numEntries`
+ * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_{prime}^*`
  *
  * Reference:
  * [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]]
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala
@@ -60,7 +60,7 @@ private[ml] trait RandomProjectionParams extends Params {
  *
  * Model produced by [[RandomProjection]], where multiple random vectors are stored. The vectors
  * are normalized to be unit vectors and each vector is used in a hash function:
- *    `h_i(x) = floor(r_i.dot(x) / bucketLength)`
+ *    `h_i(x) = floor(r_i * x / bucketLength)`
  * where `r_i` is the i-th random unit vector. The number of buckets will be `(max L2 norm of input
  * vectors) / bucketLength`.
  *
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py

Original file line number	Diff line number	Diff line change
`@@ -33,8 +33,8 @@ import org.apache.spark.sql.types.StructType`
`33`	`33`	`*`
`34`	`34`	`* Model produced by [[MinHash]], where multiple hash functions are stored. Each hash function is`
`35`	`35`	`* a perfect hash function:`
`36`		- * `h_i(x) = (x * k_i mod prime) mod numEntries`
`37`		- * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_prime^*`
	`36`	+ * `h_i(x) = (x * k_i \mod prime) \mod numEntries`
	`37`	+ * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_{prime}^*`
`38`	`38`	`*`
`39`	`39`	`* Reference:`
`40`	`40`	`* [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]]`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,7 @@ private[ml] trait RandomProjectionParams extends Params {`
`60`	`60`	`*`
`61`	`61`	`* Model produced by [[RandomProjection]], where multiple random vectors are stored. The vectors`
`62`	`62`	`* are normalized to be unit vectors and each vector is used in a hash function:`
`63`		- * `h_i(x) = floor(r_i.dot(x) / bucketLength)`
	`63`	+ * `h_i(x) = floor(r_i * x / bucketLength)`
`64`	`64`	* where `r_i` is the i-th random unit vector. The number of buckets will be `(max L2 norm of input
`65`	`65`	* vectors) / bucketLength`.
`66`	`66`	`*`