[SPARK-5962] [MLlib] Python support for Power Iteration Clustering #6992
A new Scala wrapper is added under `org.apache.spark.mllib.api.python` (new file, 32 lines):

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.mllib.api.python

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.clustering.PowerIterationClusteringModel

/**
 * A wrapper of PowerIterationClusteringModel that provides a helper method
 * for the Python API.
 */
private[python] class PowerIterationClusteringModelWrapper(model: PowerIterationClusteringModel)
  extends PowerIterationClusteringModel(model.k, model.assignments) {

  // Each element of the returned RDD is Array(id: Long, cluster: Int).
  def getAssignments: RDD[Array[Any]] = {
    model.assignments.map(x => Array(x.id, x.cluster))
  }
}
```
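For intuition, here is a minimal, Spark-free sketch of the round trip this wrapper enables; the `raw` rows are made-up stand-ins for what `getAssignments` emits, and `Assignment` mirrors the namedtuple defined in the Python diff below:

```python
from collections import namedtuple

# Mirrors PowerIterationClustering.Assignment from the Python side.
Assignment = namedtuple("Assignment", ["id", "cluster"])

# Stand-in rows: each is [id, cluster], i.e. the Array(x.id, x.cluster)
# built by getAssignments above, as it appears after Py4J serialization.
raw = [[0, 1], [1, 0], [2, 1], [3, 0]]

assignments = [Assignment(*row) for row in raw]
print(assignments[0])          # Assignment(id=0, cluster=1)
print(assignments[0].cluster)  # 1
```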
The Python-side changes land in `python/pyspark/mllib/clustering.py`. First, the imports and the `__all__` list:

```diff
@@ -25,15 +25,18 @@

 from numpy import array, random, tile

+from collections import namedtuple
+
 from pyspark import SparkContext
 from pyspark.rdd import RDD, ignore_unicode_prefix
-from pyspark.mllib.common import callMLlibFunc, callJavaFunc, _py2java, _java2py
+from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
 from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector
 from pyspark.mllib.stat.distribution import MultivariateGaussian
-from pyspark.mllib.util import Saveable, Loader, inherit_doc
+from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
 from pyspark.streaming import DStream

 __all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture',
+           'PowerIterationClusteringModel', 'PowerIterationClustering',
            'StreamingKMeans', 'StreamingKMeansModel']
```
Then the new model and algorithm classes, inserted after `GaussianMixture.train` and just before the existing `StreamingKMeansModel` (hunk `@@ -272,6 +275,94 @@`); a usage sketch follows the block:

```python
class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader):

    """
    .. note:: Experimental

    Model produced by [[PowerIterationClustering]].

    >>> data = [(0, 1, 1.0), (0, 2, 1.0), (1, 3, 1.0), (2, 3, 1.0),
    ...     (0, 3, 1.0), (1, 2, 1.0), (0, 4, 0.1)]
    >>> rdd = sc.parallelize(data, 2)
    >>> model = PowerIterationClustering.train(rdd, 2, 100)
    >>> model.k
    2
    >>> sorted(model.assignments().collect())
    [Assignment(id=0, cluster=1), Assignment(id=1, cluster=0), ...
    >>> import os, tempfile
    >>> path = tempfile.mkdtemp()
    >>> model.save(sc, path)
    >>> sameModel = PowerIterationClusteringModel.load(sc, path)
    >>> sameModel.k
    2
    >>> sorted(sameModel.assignments().collect())
    [Assignment(id=0, cluster=1), Assignment(id=1, cluster=0), ...
    >>> from shutil import rmtree
    >>> try:
    ...     rmtree(path)
    ... except OSError:
    ...     pass
    """

    @property
    def k(self):
        """
        Returns the number of clusters.
        """
        return self.call("k")

    def assignments(self):
        """
        Returns the cluster assignments of this model.
        """
        return self.call("getAssignments").map(
            lambda x: (PowerIterationClustering.Assignment(*x)))

    @classmethod
    def load(cls, sc, path):
        model = cls._load_java(sc, path)
        wrapper = sc._jvm.PowerIterationClusteringModelWrapper(model)
        return PowerIterationClusteringModel(wrapper)


class PowerIterationClustering(object):
    """
    .. note:: Experimental

    Power Iteration Clustering (PIC), a scalable graph clustering algorithm
    developed by [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]].
    From the abstract: PIC finds a very low-dimensional embedding of a
    dataset using truncated power iteration on a normalized pair-wise
    similarity matrix of the data.
    """

    @classmethod
    def train(cls, rdd, k, maxIterations=100, initMode="random"):
        """
        :param rdd: an RDD of (i, j, s_ij) tuples representing the
            affinity matrix, which is the matrix A in the PIC paper.
            The similarity s_ij must be nonnegative. This is a
            symmetric matrix and hence s_ij = s_ji. For any (i, j)
            with nonzero similarity, there should be either
            (i, j, s_ij) or (j, i, s_ji) in the input. Tuples with
            i = j are ignored, because we assume s_ij = 0.0.
        :param k: Number of clusters.
        :param maxIterations: Maximum number of iterations of the
            PIC algorithm.
        :param initMode: Initialization mode ("random" or "degree").
        """
        model = callMLlibFunc("trainPowerIterationClusteringModel",
                              rdd.map(_convert_to_vector), int(k),
                              int(maxIterations), initMode)
        return PowerIterationClusteringModel(model)

    class Assignment(namedtuple("Assignment", ["id", "cluster"])):
        """
        Represents an (id, cluster) tuple.
        """
```
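The doctest above exercises the API on a hand-written affinity list. As a complementary sketch, one might build the (i, j, s_ij) input from raw points with a Gaussian similarity; the kernel choice, the data, and the `local[2]` master are illustrative assumptions, not part of the PR:

```python
from itertools import combinations
from math import exp

from pyspark import SparkContext
from pyspark.mllib.clustering import PowerIterationClustering

sc = SparkContext("local[2]", "PICExample")

# Two well-separated 1-D groups; PIC should recover them as two clusters.
points = [0.0, 0.5, 1.0, 10.0, 10.5, 11.0]

# Gaussian similarity for each unordered pair (i < j). Emitting every edge
# exactly once satisfies the symmetry convention in the train() docstring,
# and s_ij >= 0 holds by construction.
edges = [(i, j, exp(-(points[i] - points[j]) ** 2))
         for i, j in combinations(range(len(points)), 2)]

model = PowerIterationClustering.train(sc.parallelize(edges), k=2,
                                       maxIterations=40)
for a in sorted(model.assignments().collect()):
    print(a)  # cluster labels (0/1) may swap between runs
sc.stop()
```

Since PIC only needs the nonzero similarities, a larger dataset would prune near-zero edges rather than materialize all O(n^2) pairs as this toy sketch does.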
Finally, the doctest harness at the bottom of the file is changed to seed the doctest globals from the module's own namespace rather than `globals()` (hunk `@@ -466,7 +557,8 @@`):

```diff
 def _test():
     import doctest
-    globs = globals().copy()
+    import pyspark.mllib.clustering
+    globs = pyspark.mllib.clustering.__dict__.copy()
     globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
     (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
     globs['sc'].stop()
```
Contributor: Are these changes required?

Author: Yes. With the original code, the doctests raise "Py4JError: Trying to call a package" at https://github.com/apache/spark/pull/6992/files#diff-21c55b407050d37f67a2919470e047ebR324. To compare the two namespaces:

```python
globs1 = globals().copy()
globs2 = pyspark.mllib.clustering.__dict__.copy()
print([k for k in globs1 if k not in globs2])
print([k for k in globs2 if k not in globs1])
```

The output shows that globs1 (the version that cannot pass the test) contains an extra element that globs2 does not.

Contributor: I don't remember why this is required, but thanks for the explanation; let's keep it this way.
Contributor (on `getAssignments`): It is worth documenting the elements in each vector.
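One hypothetical way to address this on the Python side, sketched here as suggested docstring wording rather than anything committed in the PR, is to spell out the element layout where `assignments()` consumes the wrapper's output:

```python
def assignments(self):
    """
    Returns the cluster assignments of this model as an RDD of
    Assignment(id, cluster) namedtuples, where ``id`` is the vertex id
    from the input affinity matrix and ``cluster`` is the zero-based
    index of the cluster the vertex was assigned to.
    """
    return self.call("getAssignments").map(
        lambda x: (PowerIterationClustering.Assignment(*x)))
```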