From 4bca4d95613e6e18361de8fe0a36667182c2d446 Mon Sep 17 00:00:00 2001
From: actuaryzhang
Date: Fri, 26 May 2017 00:40:22 -0700
Subject: [PATCH 1/6] Python port for RFormula stringIndexerOrderType

---
 python/pyspark/ml/feature.py | 45 ++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 955bc9768ce77..72576c88fef84 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -3032,6 +3032,18 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
     ...
     >>> str(loadedModel)
     'RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)'
+    >>> rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
+    >>> rf.getStringIndexerOrderType()
+    'alphabetDesc'
+    >>> rf.fit(df).transform(df).show()
+    +---+---+---+---------+-----+
+    |  y|  x|  s| features|label|
+    +---+---+---+---------+-----+
+    |1.0|1.0|  a|[1.0,0.0]|  1.0|
+    |0.0|2.0|  b|[2.0,1.0]|  0.0|
+    |0.0|0.0|  a|(2,[],[])|  0.0|
+    +---+---+---+---------+-----+
+    ...
 
     .. versionadded:: 1.5.0
     """
@@ -3043,26 +3055,35 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
                             "Force to index label whether it is numeric or string",
                             typeConverter=TypeConverters.toBoolean)
 
+    stringIndexerOrderType = Param(Params._dummy(), "stringIndexerOrderType",
+                                   "How to order categories of a string FEATURE column used by " +
+                                   "StringIndexer. The last category after ordering is dropped " +
+                                   "when encoding strings. Supported options: frequencyDesc, " +
+                                   "frequencyAsc, alphabetDesc, alphabetAsc. The default value " +
+                                   "is frequencyDesc. 
When the ordering is set to alphabetDesc, " + + "RFormula drops the same category as R when encoding strings.", + typeConverter=TypeConverters.toString) + @keyword_only def __init__(self, formula=None, featuresCol="features", labelCol="label", - forceIndexLabel=False): + forceIndexLabel=False, stringIndexerOrderType="frequencyDesc"): """ __init__(self, formula=None, featuresCol="features", labelCol="label", \ - forceIndexLabel=False) + forceIndexLabel=False, stringIndexerOrderType="frequencyDesc") """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self._setDefault(forceIndexLabel=False) + self._setDefault(forceIndexLabel=False, stringIndexerOrderType="frequencyDesc") kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.5.0") def setParams(self, formula=None, featuresCol="features", labelCol="label", - forceIndexLabel=False): + forceIndexLabel=False, stringIndexerOrderType="frequencyDesc"): """ setParams(self, formula=None, featuresCol="features", labelCol="label", \ - forceIndexLabel=False) + forceIndexLabel=False, stringIndexerOrderType="frequencyDesc") Sets params for RFormula. """ kwargs = self._input_kwargs @@ -3096,6 +3117,20 @@ def getForceIndexLabel(self): """ return self.getOrDefault(self.forceIndexLabel) + @since("2.3.0") + def setStringIndexerOrderType(self, value): + """ + Sets the value of :py:attr:`stringIndexerOrderType`. + """ + return self._set(stringIndexerOrderType=value) + + @since("2.3.0") + def getStringIndexerOrderType(self): + """ + Gets the value of :py:attr:`stringIndexerOrderType` or its default value 'frequencyDesc'. 
+ """ + return self.getOrDefault(self.stringIndexerOrderType) + def _create_model(self, java_model): return RFormulaModel(java_model) From c3f44303636654232347af38841e5347a63a860f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 26 May 2017 14:54:03 -0700 Subject: [PATCH 2/6] fix doc issue --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 72576c88fef84..ebdc5ffd70720 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3056,7 +3056,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM typeConverter=TypeConverters.toBoolean) stringIndexerOrderType = Param(Params._dummy(), "stringIndexerOrderType", - "How to order categories of a string FEATURE column used by " + + "How to order categories of a string feature column used by " + "StringIndexer. The last category after ordering is dropped " + "when encoding strings. Supported options: frequencyDesc, " + "frequencyAsc, alphabetDesc, alphabetAsc. The default value " + From 3510e24379a26551edd7abf2bf8f3fb08ec42aba Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 29 May 2017 10:50:36 -0700 Subject: [PATCH 3/6] move test to test file --- python/pyspark/ml/feature.py | 12 ------------ python/pyspark/tests.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ebdc5ffd70720..77de1cc18246d 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3032,18 +3032,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM ... 
>>> str(loadedModel) 'RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)' - >>> rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc") - >>> rf.getStringIndexerOrderType() - 'alphabetDesc' - >>> rf.fit(df).transform(df).show() - +---+---+---+---------+-----+ - | y| x| s| features|label| - +---+---+---+---------+-----+ - |1.0|1.0| a|[1.0,0.0]| 1.0| - |0.0|2.0| b|[2.0,1.0]| 0.0| - |0.0|0.0| a|(2,[],[])| 0.0| - +---+---+---+---------+-----+ - ... .. versionadded:: 1.5.0 """ diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index bb13de563cdd4..f795e3bbde38b 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -63,6 +63,7 @@ from pyspark.context import SparkContext from pyspark.rdd import RDD from pyspark.files import SparkFiles +from pyspark.ml.feature import RFormula from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer, \ CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \ PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \ @@ -2206,6 +2207,24 @@ def set(self, x=None, other=None, other_x=None): self.assertEqual(b._x, 2) +class SparkMLTests(unittest.TestCase): + + def test_rformula(self): + df = spark.createDataFrame([ + (1.0, 1.0, "a"), + (0.0, 2.0, "b"), + (0.0, 0.0, "a") + ], ["y", "x", "s"]) + rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc") + self.assertEqual(rf.getStringIndexerOrderType(), 'alphabetDesc') + + result = rf.fit(df).transform(df) + observed = result.select("features").collect() + expected = [[1.0, 0.0], [2.0, 1.0], [0.0,0.0]] + for i in range(0, len(expected)): + self.assertEqual(observed[i]["features"].toArray(), expected[i]) + + @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): From 320203eeea6d7613bb091f01b170fbfa2805b2a0 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 29 May 2017 13:41:33 
-0700 Subject: [PATCH 4/6] update test --- python/pyspark/tests.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f795e3bbde38b..a9f06f542865c 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -61,9 +61,9 @@ from pyspark import keyword_only from pyspark.conf import SparkConf from pyspark.context import SparkContext -from pyspark.rdd import RDD from pyspark.files import SparkFiles from pyspark.ml.feature import RFormula +from pyspark.rdd import RDD from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer, \ CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \ PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \ @@ -2207,20 +2207,20 @@ def set(self, x=None, other=None, other_x=None): self.assertEqual(b._x, 2) -class SparkMLTests(unittest.TestCase): +class SparkMLTests(ReusedPySparkTestCase): def test_rformula(self): - df = spark.createDataFrame([ - (1.0, 1.0, "a"), - (0.0, 2.0, "b"), - (0.0, 0.0, "a") - ], ["y", "x", "s"]) + df = self.sc.parallelize([ + (1.0, 1.0, "a"), + (0.0, 2.0, "b"), + (0.0, 0.0, "a") + ]).toDF(["y", "x", "s"]) rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc") self.assertEqual(rf.getStringIndexerOrderType(), 'alphabetDesc') result = rf.fit(df).transform(df) observed = result.select("features").collect() - expected = [[1.0, 0.0], [2.0, 1.0], [0.0,0.0]] + expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]] for i in range(0, len(expected)): self.assertEqual(observed[i]["features"].toArray(), expected[i]) From 4af4b3500de27acb0128763be755ea8078736d60 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 29 May 2017 18:09:19 -0700 Subject: [PATCH 5/6] fix test issues --- python/pyspark/ml/tests.py | 13 +++++++++++++ python/pyspark/tests.py | 21 +-------------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff 
--git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 0daf29d59cb74..a00889bae72c5 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -538,6 +538,19 @@ def test_rformula_force_index_label(self): transformedDF2 = model2.transform(df) self.assertEqual(transformedDF2.head().label, 0.0) + def test_rformula_string_indexer_order_type(self): + df = self.spark.createDataFrame([ + (1.0, 1.0, "a"), + (0.0, 2.0, "b"), + (1.0, 0.0, "a")], ["y", "x", "s"]) + rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc") + self.assertEqual(rf.getStringIndexerOrderType(), 'alphabetDesc') + transformedDF = rf.fit(df).transform(df) + observed = transformedDF.select("features").collect() + expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]] + for i in range(0, len(expected)): + self.assertTrue((observed[i]["features"].toArray() == expected[i]).all()) + class HasInducedError(Params): diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index a9f06f542865c..bb13de563cdd4 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -61,9 +61,8 @@ from pyspark import keyword_only from pyspark.conf import SparkConf from pyspark.context import SparkContext -from pyspark.files import SparkFiles -from pyspark.ml.feature import RFormula from pyspark.rdd import RDD +from pyspark.files import SparkFiles from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer, \ CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \ PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \ @@ -2207,24 +2206,6 @@ def set(self, x=None, other=None, other_x=None): self.assertEqual(b._x, 2) -class SparkMLTests(ReusedPySparkTestCase): - - def test_rformula(self): - df = self.sc.parallelize([ - (1.0, 1.0, "a"), - (0.0, 2.0, "b"), - (0.0, 0.0, "a") - ]).toDF(["y", "x", "s"]) - rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc") - 
self.assertEqual(rf.getStringIndexerOrderType(), 'alphabetDesc') - - result = rf.fit(df).transform(df) - observed = result.select("features").collect() - expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]] - for i in range(0, len(expected)): - self.assertEqual(observed[i]["features"].toArray(), expected[i]) - - @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): From 2e854a88ff83d8533225240c2394db8498fbfe25 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 30 May 2017 09:37:58 -0700 Subject: [PATCH 6/6] improve tests --- python/pyspark/ml/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a00889bae72c5..17a39472e1fe5 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -549,7 +549,7 @@ def test_rformula_string_indexer_order_type(self): observed = transformedDF.select("features").collect() expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]] for i in range(0, len(expected)): - self.assertTrue((observed[i]["features"].toArray() == expected[i]).all()) + self.assertTrue(all(observed[i]["features"].toArray() == expected[i])) class HasInducedError(Params):