Skip to content

Commit 5d55752

Browse files
author
Yun Ni
committed
Code Review Comments: Some minor fixes
1 parent c64d50b commit 5d55752

File tree

4 files changed

+10
-6
lines changed

4 files changed

+10
-6
lines changed

examples/src/main/scala/org/apache/spark/examples/ml/BucketedRandomProjectionLSHExample.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.spark.examples.ml
2121
// $example on$
2222
import org.apache.spark.ml.feature.BucketedRandomProjectionLSH
2323
import org.apache.spark.ml.linalg.Vectors
24-
import org.apache.spark.sql.functions._
24+
import org.apache.spark.sql.functions.col
2525
// $example off$
2626
import org.apache.spark.sql.SparkSession
2727

examples/src/main/scala/org/apache/spark/examples/ml/MinHashLSHExample.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.spark.examples.ml
2121
// $example on$
2222
import org.apache.spark.ml.feature.MinHashLSH
2323
import org.apache.spark.ml.linalg.Vectors
24-
import org.apache.spark.sql.functions._
24+
import org.apache.spark.sql.functions.col
2525
// $example off$
2626
import org.apache.spark.sql.SparkSession
2727

@@ -69,10 +69,10 @@ object MinHashLSHExample {
6969
// We could avoid computing hashes by passing in the already-transformed dataset, e.g.
7070
// `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
7171
println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:")
72-
model.approxSimilarityJoin(dfA, dfB, 0.6)
72+
model.approxSimilarityJoin(dfA, dfB, 0.6, "JaccardDistance")
7373
.select(col("datasetA.id").alias("idA"),
7474
col("datasetB.id").alias("idB"),
75-
col("distCol").alias("JaccardDistance")).show()
75+
col("JaccardDistance")).show()
7676

7777
// Compute the locality sensitive hashes for the input rows, then perform approximate nearest
7878
// neighbor search.

mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,8 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
232232
* @param threshold The threshold for the distance of row pairs.
233233
* @param distCol Output column for storing the distance between each pair of rows.
234234
* @return A joined dataset containing pairs of rows. The original rows are in columns
235-
* "datasetA" and "datasetB", and a distCol is added to show the distance between each
236-
* pair.
235+
* "datasetA" and "datasetB", and a column "distCol" is added to show the distance
236+
* between each pair.
237237
*/
238238
def approxSimilarityJoin(
239239
datasetA: Dataset[_],

python/pyspark/ml/feature.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
244244
>>> modelPath = temp_path + "/brp-model"
245245
>>> model.save(modelPath)
246246
>>> model2 = BucketedRandomProjectionLSHModel.load(modelPath)
247+
>>> model.transform(df).head().hashes == model2.transform(df).head().hashes
248+
True
247249
248250
.. versionadded:: 2.2.0
249251
"""
@@ -995,6 +997,8 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
995997
>>> modelPath = temp_path + "/mh-model"
996998
>>> model.save(modelPath)
997999
>>> model2 = MinHashLSHModel.load(modelPath)
1000+
>>> model.transform(df).head().hashes == model2.transform(df).head().hashes
1001+
True
9981002
9991003
.. versionadded:: 2.2.0
10001004
"""

0 commit comments

Comments (0)