From ddf34a549ad76eda4627d19b190ba70daa232bc1 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sat, 13 May 2017 17:41:47 -0700 Subject: [PATCH 01/11] Python API to StringOrderType in StringIndexer --- python/pyspark/ml/feature.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8d25f5b3a771..99f40758162c 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2115,22 +2115,32 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, .. versionadded:: 1.4.0 """ + stringOrderType = Param(Params._dummy(), "stringOrderType", + "how to order labels of string column. The first label after ordering " + + " is assigned an index of 0. Supported options: " + + "frequencyDsc, frequencyAsc, frequencyDsc, frequencyDsc.", + typeConverter=TypeConverters.toString) + @keyword_only - def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"): + def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", + stringOrderType="frequencyDsc"): """ - __init__(self, inputCol=None, outputCol=None, handleInvalid="error") + __init__(self, inputCol=None, outputCol=None, handleInvalid="error", + stringOrderType="frequencyDsc") """ super(StringIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid) - self._setDefault(handleInvalid="error") + self._setDefault(handleInvalid="error", stringOrderType="frequencyDsc") kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.4.0") - def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"): + def setParams(self, inputCol=None, outputCol=None, handleInvalid="error", + stringOrderType="frequencyDsc"): """ - setParams(self, inputCol=None, outputCol=None, handleInvalid="error") + setParams(self, inputCol=None, outputCol=None, handleInvalid="error", + stringOrderType="frequencyDsc") Sets params for this StringIndexer. """ kwargs = self._input_kwargs @@ -2139,6 +2149,19 @@ def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"): def _create_model(self, java_model): return StringIndexerModel(java_model) + @since("2.3.0") + def setStringOrderType(self, value): + """ + Sets the value of :py:attr:`stringOrderType`. + """ + return self._set(stringOrderType=value) + + @since("2.3.0") + def getStringOrderType(self): + """ + Gets the value of stringOrderType or its default value. + """ + return self.getOrDefault(self.stringOrderType) class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ From c1966bba863e7c7d2ea7333f377a18f232860587 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 00:19:44 -0700 Subject: [PATCH 02/11] fix typo --- python/pyspark/ml/feature.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 99f40758162c..cfa1011e9893 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2118,29 +2118,29 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, stringOrderType = Param(Params._dummy(), "stringOrderType", "how to order labels of string column. The first label after ordering " + " is assigned an index of 0. Supported options: " + - "frequencyDsc, frequencyAsc, frequencyDsc, frequencyDsc.", + "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.", typeConverter=TypeConverters.toString) @keyword_only def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", - stringOrderType="frequencyDsc"): + stringOrderType="frequencyDesc"): """ __init__(self, inputCol=None, outputCol=None, handleInvalid="error", - stringOrderType="frequencyDsc") + stringOrderType="frequencyDesc") """ super(StringIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid) - self._setDefault(handleInvalid="error", stringOrderType="frequencyDsc") + self._setDefault(handleInvalid="error", stringOrderType="frequencyDesc") kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.4.0") def setParams(self, inputCol=None, outputCol=None, handleInvalid="error", - stringOrderType="frequencyDsc"): + stringOrderType="frequencyDesc"): """ setParams(self, inputCol=None, outputCol=None, handleInvalid="error", - stringOrderType="frequencyDsc") + stringOrderType="frequencyDesc") Sets params for this StringIndexer. """ kwargs = self._input_kwargs From e5c8dcfcdcb9fcb9586339c3efebe85670126fb6 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 13:57:44 -0700 Subject: [PATCH 03/11] fix typo --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index cfa1011e9893..1a7d19145b7b 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2116,7 +2116,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, """ stringOrderType = Param(Params._dummy(), "stringOrderType", - "how to order labels of string column. The first label after ordering " + + "How to order labels of string column. The first label after ordering " + " is assigned an index of 0. Supported options: " + "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.", typeConverter=TypeConverters.toString) From bd80b37d9728624c6455ceca12198ce763b32a91 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 16:29:57 -0700 Subject: [PATCH 04/11] fix style --- python/pyspark/ml/feature.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 1a7d19145b7b..dda58e820e65 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2116,8 +2116,8 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, """ stringOrderType = Param(Params._dummy(), "stringOrderType", - "How to order labels of string column. The first label after ordering " + - " is assigned an index of 0. Supported options: " + + "How to order labels of string column. The first label after " + + "ordering is assigned an index of 0. Supported options: " + "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.", typeConverter=TypeConverters.toString) @@ -2163,6 +2163,7 @@ def getStringOrderType(self): """ return self.getOrDefault(self.stringOrderType) + class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`StringIndexer`. From 1f336ab70719f4074f4ac69cc0bb4750723b0bd5 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 16:53:43 -0700 Subject: [PATCH 05/11] fix style --- python/pyspark/ml/feature.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index dda58e820e65..be90e3af3599 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2118,14 +2118,14 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, stringOrderType = Param(Params._dummy(), "stringOrderType", "How to order labels of string column. The first label after " + "ordering is assigned an index of 0. Supported options: " + - "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.", + "frequencyDesc, frequencyAsc, alphabetDsec, alphabetAsc.", typeConverter=TypeConverters.toString) @keyword_only def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", stringOrderType="frequencyDesc"): """ - __init__(self, inputCol=None, outputCol=None, handleInvalid="error", + __init__(self, inputCol=None, outputCol=None, handleInvalid="error", \ stringOrderType="frequencyDesc") """ super(StringIndexer, self).__init__() @@ -2139,7 +2139,7 @@ def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", def setParams(self, inputCol=None, outputCol=None, handleInvalid="error", stringOrderType="frequencyDesc"): """ - setParams(self, inputCol=None, outputCol=None, handleInvalid="error", + setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \ stringOrderType="frequencyDesc") Sets params for this StringIndexer. """ @@ -2159,7 +2159,7 @@ def setStringOrderType(self, value): @since("2.3.0") def getStringOrderType(self): """ - Gets the value of stringOrderType or its default value. + Gets the value of :py:attr:`stringOrderType` or its default value. """ return self.getOrDefault(self.stringOrderType) From 44f0a362dd085022de215e9ab8d9536145f20d4d Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 20:42:15 -0700 Subject: [PATCH 06/11] add tests --- python/pyspark/ml/feature.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index be90e3af3599..4c7f3bc06847 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2082,8 +2082,9 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, """ A label indexer that maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it to string and index the string values. - The indices are in [0, numLabels), ordered by label frequencies. - So the most frequent label gets index 0. + The indices are in [0, numLabels). By default, this is ordered by label frequencies + so the most frequent label gets index 0. The ordering behavior is controlled by + setting stringOrderType. >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error') >>> model = stringIndexer.fit(stringIndDf) @@ -2111,6 +2112,14 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, >>> loadedInverter = IndexToString.load(indexToStringPath) >>> loadedInverter.getLabels() == inverter.getLabels() True + >>> stringIndexer.getStringOrderType() + 'frequencyDesc' + >>> stringIndexer.setStringOrderType("alphabetDesc") + >>> model = stringIndexer.fit(stringIndDf) + >>> td = model.transform(stringIndDf) + >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), + ... key=lambda x: x[0]) + [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)] .. versionadded:: 1.4.0 """ @@ -2118,7 +2127,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, stringOrderType = Param(Params._dummy(), "stringOrderType", "How to order labels of string column. The first label after " + "ordering is assigned an index of 0. Supported options: " + - "frequencyDesc, frequencyAsc, alphabetDsec, alphabetAsc.", + "frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.", typeConverter=TypeConverters.toString) @keyword_only From f66a4455aba7ffc69d1b397cb828879d84bb39a6 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 21:20:49 -0700 Subject: [PATCH 07/11] fix test error --- python/pyspark/ml/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 4c7f3bc06847..f8766ae76a40 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2114,7 +2114,8 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, True >>> stringIndexer.getStringOrderType() 'frequencyDesc' - >>> stringIndexer.setStringOrderType("alphabetDesc") + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error', + ... stringOrderType="alphabetDesc") >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf) >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), From 36006bf32f9ed6aef2de433daaba99d6e11d7e3d Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Mon, 15 May 2017 21:49:08 -0700 Subject: [PATCH 08/11] address comments --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index f8766ae76a40..b89ced86bb94 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2084,7 +2084,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, If the input column is numeric, we cast it to string and index the string values. The indices are in [0, numLabels). By default, this is ordered by label frequencies so the most frequent label gets index 0. The ordering behavior is controlled by - setting stringOrderType. + setting :py:attr:`stringOrderType`. >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error') >>> model = stringIndexer.fit(stringIndDf) @@ -2169,7 +2169,7 @@ def setStringOrderType(self, value): @since("2.3.0") def getStringOrderType(self): """ - Gets the value of :py:attr:`stringOrderType` or its default value. + Gets the value of :py:attr:`stringOrderType` or its default value 'frequencyDesc'. """ return self.getOrDefault(self.stringOrderType) From 6acabc2f2d27cc25fd6cb52ff25c1ba2ce69bd23 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Mon, 15 May 2017 22:23:58 -0700 Subject: [PATCH 09/11] minor style fix --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b89ced86bb94..8a4e45f049b0 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2086,7 +2086,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, so the most frequent label gets index 0. The ordering behavior is controlled by setting :py:attr:`stringOrderType`. - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error') + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error") >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf) >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), @@ -2114,7 +2114,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, True >>> stringIndexer.getStringOrderType() 'frequencyDesc' - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error', + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", ... stringOrderType="alphabetDesc") >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf) From 2fe9432945f16b77916244b0cc36ff07cdb53693 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sat, 20 May 2017 13:41:41 -0700 Subject: [PATCH 10/11] add default value for stringOrderType in docstring --- python/pyspark/ml/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8a4e45f049b0..85dcfa8680b8 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2084,9 +2084,10 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, If the input column is numeric, we cast it to string and index the string values. The indices are in [0, numLabels). By default, this is ordered by label frequencies so the most frequent label gets index 0. The ordering behavior is controlled by - setting :py:attr:`stringOrderType`. + setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'. >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error") + ... stringOrderType="frequencyDesc") >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf) >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), From 5bfa4dc3ba60655d9a9ce4aded935303b90d33cb Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sat, 20 May 2017 14:07:39 -0700 Subject: [PATCH 11/11] fix example error --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 85dcfa8680b8..955bc9768ce7 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2086,7 +2086,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, so the most frequent label gets index 0. The ordering behavior is controlled by setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'. - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error") + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", ... stringOrderType="frequencyDesc") >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf)