2626__all__ = ['BisectingKMeans' , 'BisectingKMeansModel' , 'BisectingKMeansSummary' ,
2727 'KMeans' , 'KMeansModel' ,
2828 'GaussianMixture' , 'GaussianMixtureModel' , 'GaussianMixtureSummary' ,
29- 'LDA' , 'LDAModel' , 'LocalLDAModel' , 'DistributedLDAModel' ]
29+ 'LDA' , 'LDAModel' , 'LocalLDAModel' , 'DistributedLDAModel' , 'PowerIterationClustering' ]
3030
3131
3232class ClusteringSummary (JavaWrapper ):
@@ -836,7 +836,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
836836
837837 Terminology:
838838
839- - "term" = "word": an el
839+ - "term" = "word": an element of the vocabulary
840840 - "token": instance of a term appearing in a document
841841 - "topic": multinomial distribution over terms representing some concept
842842 - "document": one piece of text, corresponding to one row in the input data
@@ -938,7 +938,7 @@ def __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInte
938938 k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\
939939 subsamplingRate=0.05, optimizeDocConcentration=True,\
940940 docConcentration=None, topicConcentration=None,\
941- topicDistributionCol="topicDistribution", keepLastCheckpoint=True):
941+ topicDistributionCol="topicDistribution", keepLastCheckpoint=True)
942942 """
943943 super (LDA , self ).__init__ ()
944944 self ._java_obj = self ._new_java_obj ("org.apache.spark.ml.clustering.LDA" , self .uid )
@@ -967,7 +967,7 @@ def setParams(self, featuresCol="features", maxIter=20, seed=None, checkpointInt
967967 k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\
968968 subsamplingRate=0.05, optimizeDocConcentration=True,\
969969 docConcentration=None, topicConcentration=None,\
970- topicDistributionCol="topicDistribution", keepLastCheckpoint=True):
970+ topicDistributionCol="topicDistribution", keepLastCheckpoint=True)
971971
972972 Sets params for LDA.
973973 """
@@ -1156,126 +1156,68 @@ def getKeepLastCheckpoint(self):
11561156 return self .getOrDefault (self .keepLastCheckpoint )
11571157
11581158
class _PowerIterationClusteringParams(JavaParams, HasMaxIter, HasPredictionCol):
    """
    Params for :py:class:`PowerIterationClustering`.

    Declares the ``k``, ``initMode``, ``idCol``, ``neighborsCol`` and
    ``similaritiesCol`` params together with one getter per param.

    .. versionadded:: 2.4.0
    """

    k = Param(Params._dummy(), "k",
              "The number of clusters to create. Must be > 1.",
              typeConverter=TypeConverters.toInt)
    initMode = Param(Params._dummy(), "initMode",
                     "The initialization algorithm. This can be either " +
                     "'random' to use a random vector as vertex properties, or 'degree' to use " +
                     "a normalized sum of similarities with other vertices.  Supported options: " +
                     "'random' and 'degree'.",
                     typeConverter=TypeConverters.toString)
    idCol = Param(Params._dummy(), "idCol",
                  "Name of the input column for vertex IDs.",
                  typeConverter=TypeConverters.toString)
    neighborsCol = Param(Params._dummy(), "neighborsCol",
                         "Name of the input column for neighbors in the adjacency list " +
                         "representation.",
                         typeConverter=TypeConverters.toString)
    similaritiesCol = Param(Params._dummy(), "similaritiesCol",
                            "Name of the input column for non-negative weights (similarities) " +
                            "of edges between the vertex in `idCol` and each neighbor in " +
                            "`neighborsCol`",
                            typeConverter=TypeConverters.toString)

    @since("2.4.0")
    def getK(self):
        """
        Gets the value of :py:attr:`k`.
        """
        return self.getOrDefault(self.k)

    @since("2.4.0")
    def getInitMode(self):
        """
        Gets the value of :py:attr:`initMode`.
        """
        return self.getOrDefault(self.initMode)

    @since("2.4.0")
    def getIdCol(self):
        """
        Gets the value of :py:attr:`idCol`.
        """
        return self.getOrDefault(self.idCol)

    @since("2.4.0")
    def getNeighborsCol(self):
        """
        Gets the value of :py:attr:`neighborsCol`.
        """
        return self.getOrDefault(self.neighborsCol)

    @since("2.4.0")
    def getSimilaritiesCol(self):
        """
        Gets the value of :py:attr:`similaritiesCol`.
        """
        # Fixed: this getter used to look up ``self.binary``, which is not one
        # of the params declared in this class; it must read the param it is
        # named after.
        return self.getOrDefault(self.similaritiesCol)
1221-
1222-
12231159@inherit_doc
1224- class PowerIterationClustering (JavaTransformer , _PowerIterationClusteringParams , JavaMLReadable ,
1225- JavaMLWritable ):
1160+ class PowerIterationClustering (HasMaxIter , HasPredictionCol , JavaTransformer , JavaParams ,
1161+ JavaMLReadable , JavaMLWritable ):
12261162 """
1227- Model produced by [[PowerIterationClustering]].
1163+ .. note:: Experimental
1164+ Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
1165+ `Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
1166+ PIC finds a very low-dimensional embedding of a dataset using truncated power
1167+ iteration on a normalized pair-wise similarity matrix of the data.
1168+
1169+ PIC takes an affinity matrix between items (or vertices) as input. An affinity matrix
1170+ is a symmetric matrix whose entries are non-negative similarities between items.
1171+ PIC takes this matrix (or graph) as an adjacency matrix. Specifically, each input row
1172+ includes:
1173+ - :py:class:`idCol`: vertex ID
1174+ - :py:class:`neighborsCol`: neighbors of vertex in :py:class:`idCol`
1175+ - :py:class:`similaritiesCol`: non-negative weights (similarities) of edges between the
1176+ vertex in :py:class:`idCol` and each neighbor in :py:class:`neighborsCol`
1177+ PIC returns a cluster assignment for each input vertex. It appends a new column
1178+ :py:class:`predictionCol` containing the cluster assignment in :py:class:`[0,k)` for
1179+ each row (vertex).
1180+
1181+ Notes:
1182+ - :py:class:`PowerIterationClustering` is a transformer with an expensive :py:meth:`transform` operation.
1183+ Transform runs the iterative PIC algorithm to cluster the whole input dataset.
1184+ - Input validation: This validates that similarities are non-negative but does NOT validate
1185+ that the input matrix is symmetric.
1186+
1187+ .. seealso:: `Spectral clustering (Wikipedia)
1188+ <http://en.wikipedia.org/wiki/Spectral_clustering>`_
1189+
12281190 >>> from pyspark.sql.types import ArrayType, DoubleType, LongType, StructField, StructType
1229- >>> import math
1230- >>> def genCircle(r, n):
1231- ... points = []
1232- ... for i in range(0, n):
1233- ... theta = 2.0 * math.pi * i / n
1234- ... points.append((r * math.cos(theta), r * math.sin(theta)))
1235- ... return points
1236- >>> def sim(x, y):
1237- ... dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
1238- ... return math.exp(-dist / 2.0)
1239- >>> r1 = 1.0
1240- >>> n1 = 10
1241- >>> r2 = 4.0
1242- >>> n2 = 40
1243- >>> n = n1 + n2
1244- >>> points = genCircle(r1, n1) + genCircle(r2, n2)
1245- >>> similarities = []
1246- >>> for i in range (1, n):
1247- ... neighbor = []
1248- ... weight = []
1249- ... for j in range (i):
1250- ... neighbor.append((long)(j))
1251- ... weight.append(sim(points[i], points[j]))
1252- ... similarities.append([(long)(i), neighbor, weight])
1191+ >>> similarities = [((long)(1), [0], [0.5]), ((long)(2), [0, 1], [0.7,0.5]), \
1192+ ((long)(3), [0, 1, 2], [0.9, 0.7, 0.5]), \
1193+ ((long)(4), [0, 1, 2, 3], [1.1, 0.9, 0.7,0.5]), \
1194+ ((long)(5), [0, 1, 2, 3, 4], [1.3, 1.1, 0.9, 0.7,0.5])]
12531195 >>> rdd = sc.parallelize(similarities, 2)
12541196 >>> schema = StructType([StructField("id", LongType(), False), \
1255- StructField("neighbors", ArrayType(LongType(), False), True), \
1256- StructField("similarities", ArrayType(DoubleType(), False), True)])
1197+ StructField("neighbors", ArrayType(LongType(), False), True), \
1198+ StructField("similarities", ArrayType(DoubleType(), False), True)])
12571199 >>> df = spark.createDataFrame(rdd, schema)
12581200 >>> pic = PowerIterationClustering()
1259- >>> result = pic.setK(2).setMaxIter(40 ).transform(df)
1201+ >>> result = pic.setK(2).setMaxIter(10 ).transform(df)
12601202 >>> predictions = sorted(set([(i[0], i[1]) for i in result.select(result.id, result.prediction)
12611203 ... .collect()]), key=lambda x: x[0])
12621204 >>> predictions[0]
12631205 (1, 1)
1264- >>> predictions[8 ]
1265- (9 , 1)
1266- >>> predictions[9 ]
1267- (10 , 0)
1268- >>> predictions[20 ]
1269- (21 , 0)
1270- >>> predictions[48 ]
1271- (49 , 0)
1206+ >>> predictions[1 ]
1207+ (2 , 1)
1208+ >>> predictions[2 ]
1209+ (3 , 0)
1210+ >>> predictions[3 ]
1211+ (4 , 0)
1212+ >>> predictions[4 ]
1213+ (5 , 0)
12721214 >>> pic_path = temp_path + "/pic"
12731215 >>> pic.save(pic_path)
12741216 >>> pic2 = PowerIterationClustering.load(pic_path)
12751217 >>> pic2.getK()
12761218 2
12771219 >>> pic2.getMaxIter()
1278- 40
1220+ 10
12791221 >>> pic3 = PowerIterationClustering(k=4, initMode="degree")
12801222 >>> pic3.getIdCol()
12811223 'id'
@@ -1288,12 +1230,35 @@ class PowerIterationClustering(JavaTransformer, _PowerIterationClusteringParams,
12881230
12891231 .. versionadded:: 2.4.0
12901232 """
1233+
1234+ k = Param (Params ._dummy (), "k" ,
1235+ "The number of clusters to create. Must be > 1." ,
1236+ typeConverter = TypeConverters .toInt )
1237+ initMode = Param (Params ._dummy (), "initMode" ,
1238+ "The initialization algorithm. This can be either " +
1239+ "'random' to use a random vector as vertex properties, or 'degree' to use " +
1240+ "a normalized sum of similarities with other vertices. Supported options: " +
1241+ "'random' and 'degree'." ,
1242+ typeConverter = TypeConverters .toString )
1243+ idCol = Param (Params ._dummy (), "idCol" ,
1244+ "Name of the input column for vertex IDs." ,
1245+ typeConverter = TypeConverters .toString )
1246+ neighborsCol = Param (Params ._dummy (), "neighborsCol" ,
1247+ "Name of the input column for neighbors in the adjacency list " +
1248+ "representation." ,
1249+ typeConverter = TypeConverters .toString )
1250+ similaritiesCol = Param (Params ._dummy (), "similaritiesCol" ,
1251+ "Name of the input column for non-negative weights (similarities) " +
1252+ "of edges between the vertex in `idCol` and each neighbor in " +
1253+ "`neighborsCol`" ,
1254+ typeConverter = TypeConverters .toString )
1255+
12911256 @keyword_only
12921257 def __init__ (self , predictionCol = "prediction" , k = 2 , maxIter = 20 , initMode = "random" ,
12931258 idCol = "id" , neighborsCol = "neighbors" , similaritiesCol = "similarities" ):
12941259 """
12951260 __init__(self, predictionCol="prediction", k=2, maxIter=20, initMode="random",\
1296- idCol="id", neighborsCol="neighbors", similaritiesCol="similarities"):
1261+ idCol="id", neighborsCol="neighbors", similaritiesCol="similarities")
12971262 """
12981263 super (PowerIterationClustering , self ).__init__ ()
12991264 self ._java_obj = self ._new_java_obj (
@@ -1309,7 +1274,7 @@ def setParams(self, predictionCol="prediction", k=2, maxIter=20, initMode="rando
13091274 idCol = "id" , neighborsCol = "neighbors" , similaritiesCol = "similarities" ):
13101275 """
13111276 setParams(self, predictionCol="prediction", k=2, maxIter=20, initMode="random",\
1312- idCol="id", neighborsCol="neighbors", similaritiesCol="similarities"):
1277+ idCol="id", neighborsCol="neighbors", similaritiesCol="similarities")
13131278 Sets params for PowerIterationClustering.
13141279 """
13151280 kwargs = self ._input_kwargs
@@ -1322,34 +1287,69 @@ def setK(self, value):
13221287 """
13231288 return self ._set (k = value )
13241289
@since("2.4.0")
def getK(self):
    """
    Return the value held for :py:attr:`k`, looked up through ``getOrDefault``.
    """
    k_param = self.k
    return self.getOrDefault(k_param)
1296+
@since("2.4.0")
def setInitMode(self, value):
    """
    Store ``value`` under :py:attr:`initMode` via ``_set`` and return
    whatever ``_set`` returns.
    """
    updated = self._set(initMode=value)
    return updated
13311303
@since("2.4.0")
def getInitMode(self):
    """
    Return the value held for :py:attr:`initMode`, looked up through
    ``getOrDefault``.
    """
    init_mode_param = self.initMode
    return self.getOrDefault(init_mode_param)
1310+
@since("2.4.0")
def setIdCol(self, value):
    """
    Store ``value`` under :py:attr:`idCol` via ``_set`` and return
    whatever ``_set`` returns.
    """
    updated = self._set(idCol=value)
    return updated
13381317
@since("2.4.0")
def getIdCol(self):
    """
    Return the value held for :py:attr:`idCol`, looked up through
    ``getOrDefault``.
    """
    id_col_param = self.idCol
    return self.getOrDefault(id_col_param)
1324+
@since("2.4.0")
def setNeighborsCol(self, value):
    """
    Store ``value`` under :py:attr:`neighborsCol` via ``_set`` and return
    whatever ``_set`` returns.
    """
    updated = self._set(neighborsCol=value)
    return updated
13451331
@since("2.4.0")
def getNeighborsCol(self):
    """
    Return the value held for :py:attr:`neighborsCol`, looked up through
    ``getOrDefault``.
    """
    neighbors_col_param = self.neighborsCol
    return self.getOrDefault(neighbors_col_param)
1338+
@since("2.4.0")
def setSimilaritiesCol(self, value):
    """
    Store ``value`` under :py:attr:`similaritiesCol` via ``_set`` and return
    whatever ``_set`` returns.
    """
    updated = self._set(similaritiesCol=value)
    return updated
13521345
@since("2.4.0")
def getSimilaritiesCol(self):
    """
    Gets the value of :py:attr:`similaritiesCol`.

    :return: whatever ``getOrDefault`` yields for the ``similaritiesCol`` param.
    """
    # Fixed: the getter previously read ``self.binary``, which is not among
    # the params declared for this class in the visible source (k, initMode,
    # idCol, neighborsCol, similaritiesCol). It must look up the param it is
    # named after, mirroring the sibling getters.
    return self.getOrDefault(self.similaritiesCol)
1352+
13531353
13541354if __name__ == "__main__" :
13551355 import doctest
0 commit comments