@@ -2082,8 +2082,9 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
20822082 """
20832083 A label indexer that maps a string column of labels to an ML column of label indices.
20842084 If the input column is numeric, we cast it to string and index the string values.
2085- The indices are in [0, numLabels), ordered by label frequencies.
2086- So the most frequent label gets index 0.
2085+ The indices are in [0, numLabels). By default, they are ordered by descending label
2086+ frequency, so the most frequent label gets index 0. The ordering is controlled by the
2087+ stringOrderType param, whose default value is 'frequencyDesc'.
20872088
20882089 >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error')
20892090 >>> model = stringIndexer.fit(stringIndDf)
@@ -2111,14 +2112,22 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
21112112 >>> loadedInverter = IndexToString.load(indexToStringPath)
21122113 >>> loadedInverter.getLabels() == inverter.getLabels()
21132114 True
2115+ >>> stringIndexer.getStringOrderType()
2116+ 'frequencyDesc'
2117+ >>> stringIndexer.setStringOrderType("alphabetDesc")
2118+ >>> model = stringIndexer.fit(stringIndDf)
2119+ >>> td = model.transform(stringIndDf)
2120+ >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
2121+ ... key=lambda x: x[0])
2122+ [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]
21142123
21152124 .. versionadded:: 1.4.0
21162125 """
21172126
21182127 stringOrderType = Param (Params ._dummy (), "stringOrderType" ,
21192128 "How to order labels of string column. The first label after " +
21202129 "ordering is assigned an index of 0. Supported options: " +
2121- "frequencyDesc, frequencyAsc, alphabetDsec , alphabetAsc." ,
2130+ "frequencyDesc, frequencyAsc, alphabetDesc , alphabetAsc." ,
21222131 typeConverter = TypeConverters .toString )
21232132
21242133 @keyword_only
0 commit comments