From ddf34a549ad76eda4627d19b190ba70daa232bc1 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sat, 13 May 2017 17:41:47 -0700
Subject: [PATCH 01/11] Add Python API for stringOrderType in StringIndexer

---
 python/pyspark/ml/feature.py | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8d25f5b3a771..99f40758162c 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2115,22 +2115,32 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     .. versionadded:: 1.4.0
     """
 
+    stringOrderType = Param(Params._dummy(), "stringOrderType",
+                            "how to order labels of string column. The first label after ordering " +
+                            " is assigned an index of 0. Supported options: " +
+                            "frequencyDsc, frequencyAsc, frequencyDsc, frequencyDsc.",
+                            typeConverter=TypeConverters.toString)
+
     @keyword_only
-    def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"):
+    def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
+                 stringOrderType="frequencyDsc"):
         """
-        __init__(self, inputCol=None, outputCol=None, handleInvalid="error")
+        __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
+                 stringOrderType="frequencyDsc")
         """
         super(StringIndexer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
-        self._setDefault(handleInvalid="error")
+        self._setDefault(handleInvalid="error", stringOrderType="frequencyDsc")
         kwargs = self._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.4.0")
-    def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"):
+    def setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
+                  stringOrderType="frequencyDsc"):
         """
-        setParams(self, inputCol=None, outputCol=None, handleInvalid="error")
+        setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
+                  stringOrderType="frequencyDsc")
         Sets params for this StringIndexer.
         """
         kwargs = self._input_kwargs
@@ -2139,6 +2149,19 @@ def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"):
     def _create_model(self, java_model):
         return StringIndexerModel(java_model)
 
+    @since("2.3.0")
+    def setStringOrderType(self, value):
+        """
+        Sets the value of :py:attr:`stringOrderType`.
+        """
+        return self._set(stringOrderType=value)
+
+    @since("2.3.0")
+    def getStringOrderType(self):
+        """
+        Gets the value of stringOrderType or its default value.
+        """
+        return self.getOrDefault(self.stringOrderType)
 
 class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
     """

From c1966bba863e7c7d2ea7333f377a18f232860587 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 00:19:44 -0700
Subject: [PATCH 02/11] fix typos in stringOrderType option names and default

---
 python/pyspark/ml/feature.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 99f40758162c..cfa1011e9893 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2118,29 +2118,29 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     stringOrderType = Param(Params._dummy(), "stringOrderType",
                             "how to order labels of string column. The first label after ordering " +
                             " is assigned an index of 0. Supported options: " +
-                            "frequencyDsc, frequencyAsc, frequencyDsc, frequencyDsc.",
+                            "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.",
                             typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
-                 stringOrderType="frequencyDsc"):
+                 stringOrderType="frequencyDesc"):
         """
         __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
-                 stringOrderType="frequencyDsc")
+                 stringOrderType="frequencyDesc")
         """
         super(StringIndexer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
-        self._setDefault(handleInvalid="error", stringOrderType="frequencyDsc")
+        self._setDefault(handleInvalid="error", stringOrderType="frequencyDesc")
         kwargs = self._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.4.0")
     def setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
-                  stringOrderType="frequencyDsc"):
+                  stringOrderType="frequencyDesc"):
         """
         setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
-                  stringOrderType="frequencyDsc")
+                  stringOrderType="frequencyDesc")
         Sets params for this StringIndexer.
         """
         kwargs = self._input_kwargs

From e5c8dcfcdcb9fcb9586339c3efebe85670126fb6 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 13:57:44 -0700
Subject: [PATCH 03/11] capitalize stringOrderType param description

---
 python/pyspark/ml/feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index cfa1011e9893..1a7d19145b7b 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2116,7 +2116,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     """
 
     stringOrderType = Param(Params._dummy(), "stringOrderType",
-                            "how to order labels of string column. The first label after ordering " +
+                            "How to order labels of string column. The first label after ordering " +
                             " is assigned an index of 0. Supported options: " +
                             "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.",
                             typeConverter=TypeConverters.toString)

From bd80b37d9728624c6455ceca12198ce763b32a91 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 16:29:57 -0700
Subject: [PATCH 04/11] fix style

---
 python/pyspark/ml/feature.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 1a7d19145b7b..dda58e820e65 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2116,8 +2116,8 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     """
 
     stringOrderType = Param(Params._dummy(), "stringOrderType",
-                            "How to order labels of string column. The first label after ordering " +
-                            " is assigned an index of 0. Supported options: " +
+                            "How to order labels of string column. The first label after " +
+                            "ordering is assigned an index of 0. Supported options: " +
                             "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.",
                             typeConverter=TypeConverters.toString)
 
@@ -2163,6 +2163,7 @@ def getStringOrderType(self):
         """
         return self.getOrDefault(self.stringOrderType)
 
+
 class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`StringIndexer`.

From 1f336ab70719f4074f4ac69cc0bb4750723b0bd5 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 16:53:43 -0700
Subject: [PATCH 05/11] fix style

---
 python/pyspark/ml/feature.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index dda58e820e65..be90e3af3599 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2118,14 +2118,14 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     stringOrderType = Param(Params._dummy(), "stringOrderType",
                             "How to order labels of string column. The first label after " +
                             "ordering is assigned an index of 0. Supported options: " +
-                            "frequencyDesc, frequencyAsc, alphabetDsc, alphabetAsc.",
+                            "frequencyDesc, frequencyAsc, alphabetDsec, alphabetAsc.",
                             typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
                  stringOrderType="frequencyDesc"):
         """
-        __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
+        __init__(self, inputCol=None, outputCol=None, handleInvalid="error", \
                  stringOrderType="frequencyDesc")
         """
         super(StringIndexer, self).__init__()
@@ -2139,7 +2139,7 @@ def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
     def setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
                   stringOrderType="frequencyDesc"):
         """
-        setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
+        setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \
                   stringOrderType="frequencyDesc")
         Sets params for this StringIndexer.
         """
@@ -2159,7 +2159,7 @@ def setStringOrderType(self, value):
     @since("2.3.0")
     def getStringOrderType(self):
         """
-        Gets the value of stringOrderType or its default value.
+        Gets the value of :py:attr:`stringOrderType` or its default value.
         """
         return self.getOrDefault(self.stringOrderType)
 

From 44f0a362dd085022de215e9ab8d9536145f20d4d Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 20:42:15 -0700
Subject: [PATCH 06/11] add tests

---
 python/pyspark/ml/feature.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index be90e3af3599..4c7f3bc06847 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2082,8 +2082,9 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     """
     A label indexer that maps a string column of labels to an ML column of label indices.
     If the input column is numeric, we cast it to string and index the string values.
-    The indices are in [0, numLabels), ordered by label frequencies.
-    So the most frequent label gets index 0.
+    The indices are in [0, numLabels). By default, this is ordered by label frequencies
+    so the most frequent label gets index 0. The ordering behavior is controlled by
+    setting stringOrderType.
 
     >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error')
     >>> model = stringIndexer.fit(stringIndDf)
@@ -2111,6 +2112,14 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     >>> loadedInverter = IndexToString.load(indexToStringPath)
     >>> loadedInverter.getLabels() == inverter.getLabels()
     True
+    >>> stringIndexer.getStringOrderType()
+    'frequencyDesc'
+    >>> stringIndexer.setStringOrderType("alphabetDesc")
+    >>> model = stringIndexer.fit(stringIndDf)
+    >>> td = model.transform(stringIndDf)
+    >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
+    ...     key=lambda x: x[0])
+    [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]
 
     .. versionadded:: 1.4.0
     """
@@ -2118,7 +2127,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     stringOrderType = Param(Params._dummy(), "stringOrderType",
                             "How to order labels of string column. The first label after " +
                             "ordering is assigned an index of 0. Supported options: " +
-                            "frequencyDesc, frequencyAsc, alphabetDsec, alphabetAsc.",
+                            "frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.",
                             typeConverter=TypeConverters.toString)
 
     @keyword_only

From f66a4455aba7ffc69d1b397cb828879d84bb39a6 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 21:20:49 -0700
Subject: [PATCH 07/11] fix test error

---
 python/pyspark/ml/feature.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 4c7f3bc06847..f8766ae76a40 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2114,7 +2114,8 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     True
     >>> stringIndexer.getStringOrderType()
     'frequencyDesc'
-    >>> stringIndexer.setStringOrderType("alphabetDesc")
+    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error',
+    ...     stringOrderType="alphabetDesc")
     >>> model = stringIndexer.fit(stringIndDf)
     >>> td = model.transform(stringIndDf)
     >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),

From 36006bf32f9ed6aef2de433daaba99d6e11d7e3d Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Mon, 15 May 2017 21:49:08 -0700
Subject: [PATCH 08/11] address comments

---
 python/pyspark/ml/feature.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index f8766ae76a40..b89ced86bb94 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2084,7 +2084,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     If the input column is numeric, we cast it to string and index the string values.
     The indices are in [0, numLabels). By default, this is ordered by label frequencies
     so the most frequent label gets index 0. The ordering behavior is controlled by
-    setting stringOrderType.
+    setting :py:attr:`stringOrderType`.
 
     >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error')
     >>> model = stringIndexer.fit(stringIndDf)
@@ -2169,7 +2169,7 @@ def setStringOrderType(self, value):
     @since("2.3.0")
     def getStringOrderType(self):
         """
-        Gets the value of :py:attr:`stringOrderType` or its default value.
+        Gets the value of :py:attr:`stringOrderType` or its default value 'frequencyDesc'.
         """
         return self.getOrDefault(self.stringOrderType)
 

From 6acabc2f2d27cc25fd6cb52ff25c1ba2ce69bd23 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Mon, 15 May 2017 22:23:58 -0700
Subject: [PATCH 09/11] minor style fix

---
 python/pyspark/ml/feature.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b89ced86bb94..8a4e45f049b0 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2086,7 +2086,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     so the most frequent label gets index 0. The ordering behavior is controlled by
     setting :py:attr:`stringOrderType`.
 
-    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error')
+    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error")
     >>> model = stringIndexer.fit(stringIndDf)
     >>> td = model.transform(stringIndDf)
     >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
@@ -2114,7 +2114,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     True
     >>> stringIndexer.getStringOrderType()
     'frequencyDesc'
-    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error',
+    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error",
     ...     stringOrderType="alphabetDesc")
     >>> model = stringIndexer.fit(stringIndDf)
     >>> td = model.transform(stringIndDf)

From 2fe9432945f16b77916244b0cc36ff07cdb53693 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sat, 20 May 2017 13:41:41 -0700
Subject: [PATCH 10/11] add default value for stringOrderType in docstring

---
 python/pyspark/ml/feature.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8a4e45f049b0..85dcfa8680b8 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2084,9 +2084,10 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     If the input column is numeric, we cast it to string and index the string values.
     The indices are in [0, numLabels). By default, this is ordered by label frequencies
     so the most frequent label gets index 0. The ordering behavior is controlled by
-    setting :py:attr:`stringOrderType`.
+    setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'.
 
     >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error")
+    ...     stringOrderType="frequencyDesc")
     >>> model = stringIndexer.fit(stringIndDf)
     >>> td = model.transform(stringIndDf)
     >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),

From 5bfa4dc3ba60655d9a9ce4aded935303b90d33cb Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sat, 20 May 2017 14:07:39 -0700
Subject: [PATCH 11/11] fix example error

---
 python/pyspark/ml/feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 85dcfa8680b8..955bc9768ce7 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2086,7 +2086,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
     so the most frequent label gets index 0. The ordering behavior is controlled by
     setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'.
 
-    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error")
+    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error",
     ...     stringOrderType="frequencyDesc")
     >>> model = stringIndexer.fit(stringIndDf)
     >>> td = model.transform(stringIndDf)