From f53a0751f09efcb6fe379ddec4d57d68edbd5190 Mon Sep 17 00:00:00 2001 From: Shahid Date: Sun, 6 May 2018 02:28:33 +0530 Subject: [PATCH 1/4] Example code for Power Iteration Clustering --- .../ml/PowerIterationClusteringExample.scala | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala new file mode 100644 index 000000000000..b8dc47bfad85 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +import org.apache.log4j.{Level, Logger} + +// $example on$ +import org.apache.spark.ml.clustering.PowerIterationClustering +// $example off$ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + + + /** + * An example demonstrating power iteration clustering. + * Run with + * {{{ + * bin/run-example ml.PowerIterationClusteringExample + * }}} + */ + +object PowerIterationClusteringExample { + + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName(s"${this.getClass.getSimpleName}") + .getOrCreate() + + Logger.getRootLogger.setLevel(Level.WARN) + + // $example on$ + + // Generates data. + val radius1 = 1.0 + val numPoints1 = 5 + val radius2 = 4.0 + val numPoints2 = 20 + + val dataset = generatePICData(spark, radius1, radius2, numPoints1, numPoints2) + + // Trains a PIC model. + val model = new PowerIterationClustering(). + setK(2). + setInitMode("degree"). + setMaxIter(20) + + val prediction = model.transform(dataset).select("id", "prediction") + + // Shows the result. + // println("Cluster Assignment: ") + val result = prediction.collect().map { + row => (row(1), row(0)) + }.groupBy(_._1).mapValues(_.map(_._2)) + + result.foreach { + case (cluster, points) => println(s"$cluster -> [${points.mkString(",")}]") + } + // $example off$ + + spark.stop() + } + + def generatePICData(spark: SparkSession, + r1: Double, + r2: Double, + n1: Int, + n2: Int): DataFrame = { + val n = n1 + n2 + val points = genCircle(r1, n1) ++ genCircle(r2, n2) + + val rows = for (i <- 0 until n) yield { + val neighbors = for (j <- 0 until i) yield { + j.toLong + } + val similarities = for (j <- 0 until i) yield { + sim(points(i), points(j)) + } + (i.toLong, neighbors.toArray, similarities.toArray) + } + spark.createDataFrame(rows).toDF("id", "neighbors", "similarities") + } + + /** Generates a circle of points. */ + private def genCircle(r: Double, n: Int): Array[(Double, Double)] = { + Array.tabulate(n) { i => + val theta = 2.0 * math.Pi * i / n + (r * math.cos(theta), r * math.sin(theta)) + } + } + + /** Computes Gaussian similarity. */ + private def sim(x: (Double, Double), y: (Double, Double)): Double = { + val dist2 = (x._1 - y._1) * (x._1 - y._1) + (x._2 - y._2) * (x._2 - y._2) + math.exp(-dist2 / 2.0) + } +} + +// scalastyle:on println From a718d4a6f7ea03ffea22eba6f55d99006c482606 Mon Sep 17 00:00:00 2001 From: Shahid Date: Tue, 22 May 2018 01:56:16 +0530 Subject: [PATCH 2/4] Example code for Power Iteration Clustering --- .../ml/PowerIterationClusteringExample.scala | 111 +++++------------- 1 file changed, 31 insertions(+), 80 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala index b8dc47bfad85..370e0e5270ca 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala @@ -18,13 +18,10 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.log4j.{Level, Logger} - // $example on$ import org.apache.spark.ml.clustering.PowerIterationClustering +import org.apache.spark.sql.SparkSession // $example off$ -import org.apache.spark.sql.{DataFrame, Row, SparkSession} - /** * An example demonstrating power iteration clustering. @@ -36,79 +33,33 @@ import org.apache.spark.sql.{DataFrame, Row, SparkSession} object PowerIterationClusteringExample { - def main(args: Array[String]): Unit = { - val spark = SparkSession - .builder - .appName(s"${this.getClass.getSimpleName}") - .getOrCreate() - - Logger.getRootLogger.setLevel(Level.WARN) - - // $example on$ - - // Generates data. - val radius1 = 1.0 - val numPoints1 = 5 - val radius2 = 4.0 - val numPoints2 = 20 - - val dataset = generatePICData(spark, radius1, radius2, numPoints1, numPoints2) - - // Trains a PIC model. - val model = new PowerIterationClustering(). - setK(2). - setInitMode("degree"). - setMaxIter(20) - - val prediction = model.transform(dataset).select("id", "prediction") - - // Shows the result. - // println("Cluster Assignment: ") - val result = prediction.collect().map { - row => (row(1), row(0)) - }.groupBy(_._1).mapValues(_.map(_._2)) - - result.foreach { - case (cluster, points) => println(s"$cluster -> [${points.mkString(",")}]") - } - // $example off$ - - spark.stop() - } - - def generatePICData(spark: SparkSession, - r1: Double, - r2: Double, - n1: Int, - n2: Int): DataFrame = { - val n = n1 + n2 - val points = genCircle(r1, n1) ++ genCircle(r2, n2) - - val rows = for (i <- 0 until n) yield { - val neighbors = for (j <- 0 until i) yield { - j.toLong - } - val similarities = for (j <- 0 until i) yield { - sim(points(i), points(j)) - } - (i.toLong, neighbors.toArray, similarities.toArray) - } - spark.createDataFrame(rows).toDF("id", "neighbors", "similarities") - } - - /** Generates a circle of points. */ - private def genCircle(r: Double, n: Int): Array[(Double, Double)] = { - Array.tabulate(n) { i => - val theta = 2.0 * math.Pi * i / n - (r * math.cos(theta), r * math.sin(theta)) - } - } - - /** Computes Gaussian similarity. */ - private def sim(x: (Double, Double), y: (Double, Double)): Double = { - val dist2 = (x._1 - y._1) * (x._1 - y._1) + (x._2 - y._2) * (x._2 - y._2) - math.exp(-dist2 / 2.0) - } -} - -// scalastyle:on println + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName(s"${this.getClass.getSimpleName}") + .getOrCreate() + + // $example on$ + val dataset = spark.createDataFrame(Seq( + (0L, Array(1L, 2L, 4L), Array(0.9, 0.9, 0.1)), + (1L, Array(0L, 2L), Array(0.9, 0.9)), + (2L, Array(0L, 1L), Array(0.9, 0.9)), + (3L, Array(4L), Array(0.9)), + (4L, Array(0L, 3L), Array(0.1, 0.9)) + )).toDF("id", "neighbors", "similarities") + + // Trains a PIC model. + val model = new PowerIterationClustering(). + setK(2). + setInitMode("degree"). + setMaxIter(20) + + val prediction = model.transform(dataset).select("id", "prediction") + + // Shows the cluster assignment + prediction.show() + // $example off$ + + spark.stop() + } + } From d0054939562ff822c2a0581af2a616f61a82e131 Mon Sep 17 00:00:00 2001 From: Shahid Date: Fri, 8 Jun 2018 00:22:28 +0530 Subject: [PATCH 3/4] Example code for Power Iteration Clustering --- .../ml/PowerIterationClusteringExample.scala | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala index 370e0e5270ca..b9c7c97c37ee 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala @@ -41,20 +41,21 @@ object PowerIterationClusteringExample { // $example on$ val dataset = spark.createDataFrame(Seq( - (0L, Array(1L, 2L, 4L), Array(0.9, 0.9, 0.1)), - (1L, Array(0L, 2L), Array(0.9, 0.9)), - (2L, Array(0L, 1L), Array(0.9, 0.9)), - (3L, Array(4L), Array(0.9)), - (4L, Array(0L, 3L), Array(0.1, 0.9)) - )).toDF("id", "neighbors", "similarities") + (0L, 1L, 1.0), + (0L, 2L, 1.0), + (1L, 2L, 1.0), + (3L, 4L, 1.0), + (4L, 0L, 0.1) + )).toDF("src", "dst", "weight") // Trains a PIC model. val model = new PowerIterationClustering(). setK(2). + setMaxIter(20). setInitMode("degree"). - setMaxIter(20) + setWeightCol("weight") - val prediction = model.transform(dataset).select("id", "prediction") + val prediction = model.assignClusters(dataset).select("id", "cluster") // Shows the cluster assignment prediction.show() From 6b75b8c20d862dba3f7679833a081296d2a2f8a3 Mon Sep 17 00:00:00 2001 From: Shahid Date: Fri, 8 Jun 2018 01:38:50 +0530 Subject: [PATCH 4/4] Example code for Power Iteration Clustering --- .../ml/PowerIterationClusteringExample.scala | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala index b9c7c97c37ee..ca8f7affb14e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PowerIterationClusteringExample.scala @@ -20,19 +20,10 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.PowerIterationClustering -import org.apache.spark.sql.SparkSession // $example off$ - - /** - * An example demonstrating power iteration clustering. - * Run with - * {{{ - * bin/run-example ml.PowerIterationClusteringExample - * }}} - */ +import org.apache.spark.sql.SparkSession object PowerIterationClusteringExample { - def main(args: Array[String]): Unit = { val spark = SparkSession .builder @@ -48,7 +39,6 @@ object PowerIterationClusteringExample { (4L, 0L, 0.1) )).toDF("src", "dst", "weight") - // Trains a PIC model. val model = new PowerIterationClustering(). setK(2). setMaxIter(20). @@ -58,7 +48,7 @@ object PowerIterationClusteringExample { val prediction = model.assignClusters(dataset).select("id", "cluster") // Shows the cluster assignment - prediction.show() + prediction.show(false) // $example off$ spark.stop()