Skip to content

Commit 5d55752

Browse files
author
Yun Ni
committed
Code Review Comments: Some minor fixes
1 parent c64d50b commit 5d55752

File tree

4 files changed

+10
-6
lines changed

4 files changed

+10
-6
lines changed

examples/src/main/scala/org/apache/spark/examples/ml/BucketedRandomProjectionLSHExample.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.spark.examples.ml
2121
// $example on$
2222
import org.apache.spark.ml.feature.BucketedRandomProjectionLSH
2323
import org.apache.spark.ml.linalg.Vectors
24-
import org.apache.spark.sql.functions._
24+
import org.apache.spark.sql.functions.col
2525
// $example off$
2626
import org.apache.spark.sql.SparkSession
2727

examples/src/main/scala/org/apache/spark/examples/ml/MinHashLSHExample.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.spark.examples.ml
2121
// $example on$
2222
import org.apache.spark.ml.feature.MinHashLSH
2323
import org.apache.spark.ml.linalg.Vectors
24-
import org.apache.spark.sql.functions._
24+
import org.apache.spark.sql.functions.col
2525
// $example off$
2626
import org.apache.spark.sql.SparkSession
2727

@@ -69,10 +69,10 @@ object MinHashLSHExample {
6969
// We could avoid computing hashes by passing in the already-transformed dataset, e.g.
7070
// `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
7171
println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:")
72-
model.approxSimilarityJoin(dfA, dfB, 0.6)
72+
model.approxSimilarityJoin(dfA, dfB, 0.6, "JaccardDistance")
7373
.select(col("datasetA.id").alias("idA"),
7474
col("datasetB.id").alias("idB"),
75-
col("distCol").alias("JaccardDistance")).show()
75+
col("JaccardDistance")).show()
7676

7777
// Compute the locality sensitive hashes for the input rows, then perform approximate nearest
7878
// neighbor search.

mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,8 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
232232
* @param threshold The threshold for the distance of row pairs.
233233
* @param distCol Output column for storing the distance between each pair of rows.
234234
* @return A joined dataset containing pairs of rows. The original rows are in columns
235-
* "datasetA" and "datasetB", and a distCol is added to show the distance between each
236-
* pair.
235+
* "datasetA" and "datasetB", and a column "distCol" is added to show the distance
236+
* between each pair.
237237
*/
238238
def approxSimilarityJoin(
239239
datasetA: Dataset[_],

python/pyspark/ml/feature.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
244244
>>> modelPath = temp_path + "/brp-model"
245245
>>> model.save(modelPath)
246246
>>> model2 = BucketedRandomProjectionLSHModel.load(modelPath)
247+
>>> model.transform(df).head().hashes == model2.transform(df).head().hashes
248+
True
247249
248250
.. versionadded:: 2.2.0
249251
"""
@@ -995,6 +997,8 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
995997
>>> modelPath = temp_path + "/mh-model"
996998
>>> model.save(modelPath)
997999
>>> model2 = MinHashLSHModel.load(modelPath)
1000+
>>> model.transform(df).head().hashes == model2.transform(df).head().hashes
1001+
True
9981002
9991003
.. versionadded:: 2.2.0
10001004
"""

0 commit comments

Comments (0)