From a35d739d9206a66392f2ecc62f2946d7f7697225 Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Mon, 13 Apr 2020 18:58:04 +0800 Subject: [PATCH 1/3] init --- .../apache/spark/ml/feature/MinHashLSH.scala | 44 ++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala index ac3d79d07755..237dbb8bd808 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala @@ -70,19 +70,51 @@ class MinHashLSHModel private[ml]( @Since("2.1.0") override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { - val xSet = x.nonZeroIterator.map(_._1).toSet - val ySet = y.nonZeroIterator.map(_._1).toSet - val intersectionSize = xSet.intersect(ySet).size.toDouble - val unionSize = xSet.size + ySet.size - intersectionSize + val xIter = x.nonZeroIterator.map(_._1) + val yIter = y.nonZeroIterator.map(_._1) + if (xIter.isEmpty) { + assert(yIter.hasNext, "The union of two input sets must have at least 1 elements") + return 0.0 + } else if (yIter.isEmpty) { + return 0.0 + } + + var xIndex = xIter.next + var yIndex = yIter.next + var xSize = 1 + var ySize = 1 + var intersectionSize = 0 + + while (xIndex != -1 || yIndex != -1) { + if (xIndex != -1 && yIndex != -1) { + if (xIndex == yIndex) { + intersectionSize += 1 + xIndex = if (xIter.hasNext) { xSize += 1; xIter.next } else -1 + yIndex = if (yIter.hasNext) { ySize += 1; yIter.next } else -1 + } else if (xIndex > yIndex) { + yIndex = if (yIter.hasNext) { ySize += 1; yIter.next } else -1 + } else { + xIndex = if (xIter.hasNext) { xSize += 1; xIter.next } else -1 + } + } else if (xIndex != -1) { + while (xIter.hasNext) { xIndex = xIter.next; xSize += 1 } + xIndex = -1 + } else { + while (yIter.hasNext) { yIndex = yIter.next; ySize += 1 } + yIndex = -1 + } + } + + val unionSize = xSize + ySize - intersectionSize assert(unionSize > 0, "The union of two input sets must have at least 1 elements") - 1 - intersectionSize / unionSize + 1 - intersectionSize.toDouble / unionSize } @Since("2.1.0") override protected[ml] def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. // TODO: This hashDistance function requires more discussion in SPARK-18454 - x.zip(y).map(vectorPair => + x.iterator.zip(y.iterator).map(vectorPair => vectorPair._1.toArray.zip(vectorPair._2.toArray).count(pair => pair._1 != pair._2) ).min } From a77225ed52c4d8997ba874f0074aff02f6b730ee Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Mon, 13 Apr 2020 20:12:22 +0800 Subject: [PATCH 2/3] empty -> 1.0 --- .../main/scala/org/apache/spark/ml/feature/MinHashLSH.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala index 237dbb8bd808..50200f3f54e2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala @@ -74,9 +74,9 @@ class MinHashLSHModel private[ml]( val yIter = y.nonZeroIterator.map(_._1) if (xIter.isEmpty) { assert(yIter.hasNext, "The union of two input sets must have at least 1 elements") - return 0.0 + return 1.0 } else if (yIter.isEmpty) { - return 0.0 + return 1.0 } var xIndex = xIter.next From 7725d096082821242a628a6c59c1cb3739039749 Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Tue, 14 Apr 2020 10:36:33 +0800 Subject: [PATCH 3/3] use iter.size for remaining elements use iter.size for remaining elements --- .../apache/spark/ml/feature/MinHashLSH.scala | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala index 50200f3f54e2..be467c654aaa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala @@ -73,7 +73,7 @@ class MinHashLSHModel private[ml]( val xIter = x.nonZeroIterator.map(_._1) val yIter = y.nonZeroIterator.map(_._1) if (xIter.isEmpty) { - assert(yIter.hasNext, "The union of two input sets must have at least 1 elements") + require(yIter.hasNext, "The union of two input sets must have at least 1 elements") return 1.0 } else if (yIter.isEmpty) { return 1.0 @@ -85,28 +85,23 @@ class MinHashLSHModel private[ml]( var ySize = 1 var intersectionSize = 0 - while (xIndex != -1 || yIndex != -1) { - if (xIndex != -1 && yIndex != -1) { - if (xIndex == yIndex) { - intersectionSize += 1 - xIndex = if (xIter.hasNext) { xSize += 1; xIter.next } else -1 - yIndex = if (yIter.hasNext) { ySize += 1; yIter.next } else -1 - } else if (xIndex > yIndex) { - yIndex = if (yIter.hasNext) { ySize += 1; yIter.next } else -1 - } else { - xIndex = if (xIter.hasNext) { xSize += 1; xIter.next } else -1 - } - } else if (xIndex != -1) { - while (xIter.hasNext) { xIndex = xIter.next; xSize += 1 } - xIndex = -1 + while (xIndex != -1 && yIndex != -1) { + if (xIndex == yIndex) { + intersectionSize += 1 + xIndex = if (xIter.hasNext) { xSize += 1; xIter.next } else -1 + yIndex = if (yIter.hasNext) { ySize += 1; yIter.next } else -1 + } else if (xIndex > yIndex) { + yIndex = if (yIter.hasNext) { ySize += 1; yIter.next } else -1 } else { - while (yIter.hasNext) { yIndex = yIter.next; ySize += 1 } - yIndex = -1 + xIndex = if (xIter.hasNext) { xSize += 1; xIter.next } else -1 } } + xSize += xIter.size + ySize += yIter.size + val unionSize = xSize + ySize - intersectionSize - assert(unionSize > 0, "The union of two input sets must have at least 1 elements") + require(unionSize > 0, "The union of two input sets must have at least 1 elements") 1 - intersectionSize.toDouble / unionSize }