
Commit 6d00f34

address comments
1 parent 387d6ff commit 6d00f34

2 files changed: +156 -109 lines changed

python/pyspark/ml/clustering.py

Lines changed: 109 additions & 109 deletions
@@ -26,7 +26,7 @@
 __all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary',
            'KMeans', 'KMeansModel',
            'GaussianMixture', 'GaussianMixtureModel', 'GaussianMixtureSummary',
-           'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']
+           'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel', 'PowerIterationClustering']


 class ClusteringSummary(JavaWrapper):
@@ -836,7 +836,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval,

     Terminology:

-    - "term" = "word": an el
+    - "term" = "word": an element of the vocabulary
     - "token": instance of a term appearing in a document
     - "topic": multinomial distribution over terms representing some concept
     - "document": one piece of text, corresponding to one row in the input data
@@ -938,7 +938,7 @@ def __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,
                  k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\
                  subsamplingRate=0.05, optimizeDocConcentration=True,\
                  docConcentration=None, topicConcentration=None,\
-                 topicDistributionCol="topicDistribution", keepLastCheckpoint=True):
+                 topicDistributionCol="topicDistribution", keepLastCheckpoint=True)
         """
         super(LDA, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid)
@@ -967,7 +967,7 @@ def setParams(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,
                   k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\
                   subsamplingRate=0.05, optimizeDocConcentration=True,\
                   docConcentration=None, topicConcentration=None,\
-                  topicDistributionCol="topicDistribution", keepLastCheckpoint=True):
+                  topicDistributionCol="topicDistribution", keepLastCheckpoint=True)

         Sets params for LDA.
         """
@@ -1156,126 +1156,68 @@ def getKeepLastCheckpoint(self):
         return self.getOrDefault(self.keepLastCheckpoint)


-class _PowerIterationClusteringParams(JavaParams, HasMaxIter, HasPredictionCol):
-    """
-    Params for :py:attr:`PowerIterationClustering`.
-    .. versionadded:: 2.4.0
-    """
-
-    k = Param(Params._dummy(), "k",
-              "The number of clusters to create. Must be > 1.",
-              typeConverter=TypeConverters.toInt)
-    initMode = Param(Params._dummy(), "initMode",
-                     "The initialization algorithm. This can be either " +
-                     "'random' to use a random vector as vertex properties, or 'degree' to use " +
-                     "a normalized sum of similarities with other vertices. Supported options: " +
-                     "'random' and 'degree'.",
-                     typeConverter=TypeConverters.toString)
-    idCol = Param(Params._dummy(), "idCol",
-                  "Name of the input column for vertex IDs.",
-                  typeConverter=TypeConverters.toString)
-    neighborsCol = Param(Params._dummy(), "neighborsCol",
-                         "Name of the input column for neighbors in the adjacency list " +
-                         "representation.",
-                         typeConverter=TypeConverters.toString)
-    similaritiesCol = Param(Params._dummy(), "similaritiesCol",
-                            "Name of the input column for non-negative weights (similarities) " +
-                            "of edges between the vertex in `idCol` and each neighbor in " +
-                            "`neighborsCol`",
-                            typeConverter=TypeConverters.toString)
-
-    @since("2.4.0")
-    def getK(self):
-        """
-        Gets the value of `k`
-        """
-        return self.getOrDefault(self.k)
-
-    @since("2.4.0")
-    def getInitMode(self):
-        """
-        Gets the value of `initMode`
-        """
-        return self.getOrDefault(self.initMode)
-
-    @since("2.4.0")
-    def getIdCol(self):
-        """
-        Gets the value of `idCol`
-        """
-        return self.getOrDefault(self.idCol)
-
-    @since("2.4.0")
-    def getNeighborsCol(self):
-        """
-        Gets the value of `neighborsCol`
-        """
-        return self.getOrDefault(self.neighborsCol)
-
-    @since("2.4.0")
-    def getSimilaritiesCol(self):
-        """
-        Gets the value of `similaritiesCol`
-        """
-        return self.getOrDefault(self.binary)
-
-
 @inherit_doc
-class PowerIterationClustering(JavaTransformer, _PowerIterationClusteringParams, JavaMLReadable,
-                               JavaMLWritable):
+class PowerIterationClustering(HasMaxIter, HasPredictionCol, JavaTransformer, JavaParams,
+                               JavaMLReadable, JavaMLWritable):
     """
-    Model produced by [[PowerIterationClustering]].
+    .. note:: Experimental
+
+    Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
+    `Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
+    PIC finds a very low-dimensional embedding of a dataset using truncated power
+    iteration on a normalized pair-wise similarity matrix of the data.
+
+    PIC takes an affinity matrix between items (or vertices) as input. An affinity matrix
+    is a symmetric matrix whose entries are non-negative similarities between items.
+    PIC takes this matrix (or graph) as an adjacency matrix. Specifically, each input row
+    includes:
+
+    - :py:attr:`idCol`: vertex ID
+    - :py:attr:`neighborsCol`: neighbors of the vertex in :py:attr:`idCol`
+    - :py:attr:`similaritiesCol`: non-negative weights (similarities) of edges between the
+      vertex in :py:attr:`idCol` and each neighbor in :py:attr:`neighborsCol`
+
+    PIC returns a cluster assignment for each input vertex. It appends a new column
+    :py:attr:`predictionCol` containing the cluster assignment in ``[0, k)`` for
+    each row (vertex).
+
+    Notes:
+
+    - :py:class:`PowerIterationClustering` is a transformer with an expensive
+      :py:meth:`transform` operation. Transform runs the iterative PIC algorithm to
+      cluster the whole input dataset.
+    - Input validation: This validates that similarities are non-negative but does NOT
+      validate that the input matrix is symmetric.
+
+    .. seealso:: `Spectral clustering (Wikipedia)
+        <http://en.wikipedia.org/wiki/Spectral_clustering>`_
+
     >>> from pyspark.sql.types import ArrayType, DoubleType, LongType, StructField, StructType
-    >>> import math
-    >>> def genCircle(r, n):
-    ...     points = []
-    ...     for i in range(0, n):
-    ...         theta = 2.0 * math.pi * i / n
-    ...         points.append((r * math.cos(theta), r * math.sin(theta)))
-    ...     return points
-    >>> def sim(x, y):
-    ...     dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
-    ...     return math.exp(-dist / 2.0)
-    >>> r1 = 1.0
-    >>> n1 = 10
-    >>> r2 = 4.0
-    >>> n2 = 40
-    >>> n = n1 + n2
-    >>> points = genCircle(r1, n1) + genCircle(r2, n2)
-    >>> similarities = []
-    >>> for i in range(1, n):
-    ...     neighbor = []
-    ...     weight = []
-    ...     for j in range(i):
-    ...         neighbor.append((long)(j))
-    ...         weight.append(sim(points[i], points[j]))
-    ...     similarities.append([(long)(i), neighbor, weight])
+    >>> similarities = [((long)(1), [0], [0.5]), ((long)(2), [0, 1], [0.7, 0.5]), \
+                        ((long)(3), [0, 1, 2], [0.9, 0.7, 0.5]), \
+                        ((long)(4), [0, 1, 2, 3], [1.1, 0.9, 0.7, 0.5]), \
+                        ((long)(5), [0, 1, 2, 3, 4], [1.3, 1.1, 0.9, 0.7, 0.5])]
     >>> rdd = sc.parallelize(similarities, 2)
     >>> schema = StructType([StructField("id", LongType(), False), \
-                 StructField("neighbors", ArrayType(LongType(), False), True), \
-                 StructField("similarities", ArrayType(DoubleType(), False), True)])
+                             StructField("neighbors", ArrayType(LongType(), False), True), \
+                             StructField("similarities", ArrayType(DoubleType(), False), True)])
     >>> df = spark.createDataFrame(rdd, schema)
     >>> pic = PowerIterationClustering()
-    >>> result = pic.setK(2).setMaxIter(40).transform(df)
+    >>> result = pic.setK(2).setMaxIter(10).transform(df)
     >>> predictions = sorted(set([(i[0], i[1]) for i in result.select(result.id, result.prediction)
     ...     .collect()]), key=lambda x: x[0])
     >>> predictions[0]
     (1, 1)
-    >>> predictions[8]
-    (9, 1)
-    >>> predictions[9]
-    (10, 0)
-    >>> predictions[20]
-    (21, 0)
-    >>> predictions[48]
-    (49, 0)
+    >>> predictions[1]
+    (2, 1)
+    >>> predictions[2]
+    (3, 0)
+    >>> predictions[3]
+    (4, 0)
+    >>> predictions[4]
+    (5, 0)
     >>> pic_path = temp_path + "/pic"
     >>> pic.save(pic_path)
     >>> pic2 = PowerIterationClustering.load(pic_path)
     >>> pic2.getK()
     2
     >>> pic2.getMaxIter()
-    40
+    10
     >>> pic3 = PowerIterationClustering(k=4, initMode="degree")
     >>> pic3.getIdCol()
     'id'
@@ -1288,12 +1230,35 @@ class PowerIterationClustering(JavaTransformer, _PowerIterationClusteringParams,

     .. versionadded:: 2.4.0
     """
+
+    k = Param(Params._dummy(), "k",
+              "The number of clusters to create. Must be > 1.",
+              typeConverter=TypeConverters.toInt)
+    initMode = Param(Params._dummy(), "initMode",
+                     "The initialization algorithm. This can be either " +
+                     "'random' to use a random vector as vertex properties, or 'degree' to use " +
+                     "a normalized sum of similarities with other vertices. Supported options: " +
+                     "'random' and 'degree'.",
+                     typeConverter=TypeConverters.toString)
+    idCol = Param(Params._dummy(), "idCol",
+                  "Name of the input column for vertex IDs.",
+                  typeConverter=TypeConverters.toString)
+    neighborsCol = Param(Params._dummy(), "neighborsCol",
+                         "Name of the input column for neighbors in the adjacency list " +
+                         "representation.",
+                         typeConverter=TypeConverters.toString)
+    similaritiesCol = Param(Params._dummy(), "similaritiesCol",
+                            "Name of the input column for non-negative weights (similarities) " +
+                            "of edges between the vertex in `idCol` and each neighbor in " +
+                            "`neighborsCol`",
+                            typeConverter=TypeConverters.toString)
+
     @keyword_only
     def __init__(self, predictionCol="prediction", k=2, maxIter=20, initMode="random",
                  idCol="id", neighborsCol="neighbors", similaritiesCol="similarities"):
         """
         __init__(self, predictionCol="prediction", k=2, maxIter=20, initMode="random",\
-                 idCol="id", neighborsCol="neighbors", similaritiesCol="similarities"):
+                 idCol="id", neighborsCol="neighbors", similaritiesCol="similarities")
         """
         super(PowerIterationClustering, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -1309,7 +1274,7 @@ def setParams(self, predictionCol="prediction", k=2, maxIter=20, initMode="random",
                   idCol="id", neighborsCol="neighbors", similaritiesCol="similarities"):
         """
         setParams(self, predictionCol="prediction", k=2, maxIter=20, initMode="random",\
-                  idCol="id", neighborsCol="neighbors", similaritiesCol="similarities"):
+                  idCol="id", neighborsCol="neighbors", similaritiesCol="similarities")
         Sets params for PowerIterationClustering.
         """
         kwargs = self._input_kwargs
@@ -1322,34 +1287,69 @@ def setK(self, value):
         """
         return self._set(k=value)

+    @since("2.4.0")
+    def getK(self):
+        """
+        Gets the value of :py:attr:`k`.
+        """
+        return self.getOrDefault(self.k)
+
     @since("2.4.0")
     def setInitMode(self, value):
         """
         Sets the value of :py:attr:`initMode`.
         """
         return self._set(initMode=value)

+    @since("2.4.0")
+    def getInitMode(self):
+        """
+        Gets the value of :py:attr:`initMode`.
+        """
+        return self.getOrDefault(self.initMode)
+
     @since("2.4.0")
     def setIdCol(self, value):
         """
         Sets the value of :py:attr:`idCol`.
         """
         return self._set(idCol=value)

+    @since("2.4.0")
+    def getIdCol(self):
+        """
+        Gets the value of :py:attr:`idCol`.
+        """
+        return self.getOrDefault(self.idCol)
+
     @since("2.4.0")
     def setNeighborsCol(self, value):
         """
         Sets the value of :py:attr:`neighborsCol`.
         """
         return self._set(neighborsCol=value)

+    @since("2.4.0")
+    def getNeighborsCol(self):
+        """
+        Gets the value of :py:attr:`neighborsCol`.
+        """
+        return self.getOrDefault(self.neighborsCol)
+
     @since("2.4.0")
     def setSimilaritiesCol(self, value):
         """
         Sets the value of :py:attr:`similaritiesCol`.
         """
         return self._set(similaritiesCol=value)

+    @since("2.4.0")
+    def getSimilaritiesCol(self):
+        """
+        Gets the value of :py:attr:`similaritiesCol`.
+        """
+        return self.getOrDefault(self.similaritiesCol)
+

 if __name__ == "__main__":
     import doctest
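
For context, everything the new transformer consumes arrives through the three input columns documented in the docstring above. The snippet below is a minimal usage sketch, not part of the commit: it assumes a PySpark build containing this change with an active SparkSession bound to the name `spark`, and it reuses the doctest's five-vertex affinity list with the default column names.

    from pyspark.sql.types import ArrayType, DoubleType, LongType, StructField, StructType
    from pyspark.ml.clustering import PowerIterationClustering

    # Adjacency-list form of the affinity matrix: each row carries a vertex ID,
    # that vertex's neighbors, and the non-negative edge similarities to them.
    data = [(1, [0], [0.5]),
            (2, [0, 1], [0.7, 0.5]),
            (3, [0, 1, 2], [0.9, 0.7, 0.5]),
            (4, [0, 1, 2, 3], [1.1, 0.9, 0.7, 0.5]),
            (5, [0, 1, 2, 3, 4], [1.3, 1.1, 0.9, 0.7, 0.5])]
    schema = StructType([StructField("id", LongType(), False),
                         StructField("neighbors", ArrayType(LongType(), False), True),
                         StructField("similarities", ArrayType(DoubleType(), False), True)])
    df = spark.createDataFrame(data, schema)

    # transform() runs the full iterative PIC algorithm (hence the "expensive
    # transform" note above) and appends a `prediction` column in [0, k).
    pic = PowerIterationClustering(k=2, maxIter=10)
    pic.transform(df).show()

Note that the docstring's symmetry caveat applies here as well: each edge is listed only once in the rows above, and the implementation does not verify that the implied matrix is symmetric.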

python/pyspark/ml/tests.py

Lines changed: 47 additions & 0 deletions
@@ -1873,6 +1873,53 @@ def test_kmeans_cosine_distance(self):
         self.assertTrue(result[4].prediction == result[5].prediction)


+class PowerIterationClustering(SparkSessionTestCase):
+
+    def test_power_iteration_clustering(self):
+        from pyspark.sql.types import ArrayType, DoubleType, LongType, StructField, StructType
+        from pyspark.ml.clustering import PowerIterationClustering
+        import math
+
+        def genCircle(r, n):
+            points = []
+            for i in range(0, n):
+                theta = 2.0 * math.pi * i / n
+                points.append((r * math.cos(theta), r * math.sin(theta)))
+            return points
+
+        def sim(x, y):
+            dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
+            return math.exp(-dist / 2.0)
+
+        r1 = 1.0
+        n1 = 10
+        r2 = 4.0
+        n2 = 40
+        n = n1 + n2
+        points = genCircle(r1, n1) + genCircle(r2, n2)
+        similarities = []
+        for i in range(1, n):
+            neighbor = []
+            weight = []
+            for j in range(i):
+                neighbor.append((long)(j))
+                weight.append(sim(points[i], points[j]))
+            similarities.append([(long)(i), neighbor, weight])
+        rdd = self.sc.parallelize(similarities, 2)
+        schema = StructType([StructField("id", LongType(), False),
+                             StructField("neighbors", ArrayType(LongType(), False), True),
+                             StructField("similarities", ArrayType(DoubleType(), False), True)])
+        df = self.spark.createDataFrame(rdd, schema)
+        pic = PowerIterationClustering()
+        result = pic.setK(2).setMaxIter(40).transform(df)
+        predictions = sorted(set([(i[0], i[1]) for i in result.select(result.id,
+                                  result.prediction).collect()]), key=lambda x: x[0])
+        for i in range(0, 8):
+            self.assertEqual(predictions[i], (i+1, 1))
+        for i in range(9, 48):
+            self.assertEqual(predictions[i], (i+1, 0))
+
+
 class OneVsRestTests(SparkSessionTestCase):

     def test_copy(self):